# Experimenting with naive Bayes classifiers
Reference: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#extracting-features-from-text-files

In [1]:
# import libraries
import pandas as pd

In [2]:
# import datasets
train_path = "../data/train.csv"

train_data = pd.read_csv(train_path)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144293 entries, 0 to 144292
Data columns (total 8 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  144293 non-null  object 
 1   discourse_id        144293 non-null  float64
 2   discourse_start     144293 non-null  float64
 3   discourse_end       144293 non-null  float64
 4   discourse_text      144293 non-null  object 
 5   discourse_type      144293 non-null  object 
 6   discourse_type_num  144293 non-null  object 
 7   predictionstring    144293 non-null  object 
dtypes: float64(3), object(5)
memory usage: 8.8+ MB


In [3]:
from sklearn.model_selection import train_test_split
X = train_data['discourse_text']
y = train_data['discourse_type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=29)

In [4]:
# CountVectorizer builds a dictionary of features and transforms documents to feature vectors.
# CountVectorizer supports counts of N-grams of words or consecutive characters.
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape


(101005, 41751)

In [5]:
# Once fitted, the vectorizer has built a dictionary of feature indices.
# The index value of a word in the vocabulary is linked to its frequency in the whole training corpus.

print(count_vect.vocabulary_.get(u'and'))
print(count_vect.vocabulary_.get(u'but'))

2295
5610


In [6]:
# normalisation step: from occurrences to frequencies
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape


(101005, 41751)

In [7]:
# create and train multinomial classifier
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [8]:
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

predicted = clf.predict(X_test_tfidf)

for i in range(10):
  print('text: ', X_test.iloc[i])
  print('predicted category: ', predicted[i])
  print('real category: ', y_test.iloc[i])  

text:   Students should not be allowed to design their summer projects 
predicted category:  Claim
real category:  Position
text:  Finally, The FACS could help find if students are depressed or having problems at home.
predicted category:  Claim
real category:  Claim
text:  You might say that not all sports are right after school, and I'd say that is correct,
predicted category:  Evidence
real category:  Counterclaim
text:  Every student should do at least 2 community services. Maybe one at home and one at school
predicted category:  Claim
real category:  Evidence
text:  Social Skills are a key component of learning through any type of school. The interaction between other students is extremely valuable because it builds relationships and confidence. 
predicted category:  Claim
real category:  Claim
text:  Where as if I'm at home I can be free complete things on my time, I can take time on my assignments, I don't have to compare myself to others, and I can be me. At home there is no on

In [9]:
# same thing with pipeline
# from sklearn.pipeline import Pipeline
# text_clf = Pipeline([
#   ('vect', CountVectorizer()),
#   ('tfidf', TfidfTransformer()),
#   ('clf', MultinomialNB()),
# ])
# text_clf.fit(X_train, y_train)


Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [10]:
# evaluate performance
import numpy as np
accuracy = np.mean(predicted == y_test)
print('accuracy: ', accuracy )

accuracy:  0.4969275549805951


Better than random! 

7 categories -> approximately 14.2% to randomly get it right. 

It's a start.

In [20]:
discourse_types = list(set(train_data["discourse_type"]))
print('proportions of discourse types in the whole dataset:')
print(train_data["discourse_type"].value_counts())
print('proportions of discourse types in the train dataset:')
print(y_train.value_counts())
print('proportions of discourse types in the test dataset:')
print(y_test.value_counts())

proportions of discourse types in the whole dataset:
Claim                   50208
Evidence                45702
Position                15419
Concluding Statement    13505
Lead                     9305
Counterclaim             5817
Rebuttal                 4337
Name: discourse_type, dtype: int64
proportions of discourse types in the train dataset:
Claim                   35266
Evidence                32073
Position                10792
Concluding Statement     9352
Lead                     6513
Counterclaim             4060
Rebuttal                 2949
Name: discourse_type, dtype: int64
proportions of discourse types in the test dataset:
Claim                   14942
Evidence                13629
Position                 4627
Concluding Statement     4153
Lead                     2792
Counterclaim             1757
Rebuttal                 1388
Name: discourse_type, dtype: int64


In [18]:
from sklearn import metrics
print('confusion matrix for y_test predictions')
print(metrics.confusion_matrix(y_test, predicted))

confusion matrix for y_test predictions
[[10074     2     0  4849     0    17     0]
 [ 2667    19     0  1427     0    40     0]
 [ 1033     1     7   706     0    10     0]
 [ 2975     1     0 10652     0     1     0]
 [ 1246     2     0  1480     9    55     0]
 [ 3144     4     0   729     0   750     0]
 [  600     1     0   781     0     6     0]]


Somehow the normalization steps (?) killed our most uncommon categories!

In [12]:
# Linear support vector machine classifier
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([
  ('vect', CountVectorizer()),
  ('tfidf', TfidfTransformer()),
  ('clf', SGDClassifier(loss='hinge', penalty='l2',
                        alpha=1e-3, random_state=42,
                        max_iter=5, tol=None)),
])
text_clf.fit(X_train, y_train)
predicted2 = text_clf.predict(X_test)
accuracy2 = np.mean(predicted2 == y_test)
print('accuracy: ', accuracy2 )

accuracy:  0.5891933099242285


Linear support vector machine improves!

In [13]:
# Parameter fine tuning
from sklearn.model_selection import GridSearchCV
parameters = {
  'vect__ngram_range': [(1, 1), (1, 2)],
  'tfidf__use_idf': (True, False),
  'clf__alpha': (1e-2, 1e-3),
}

In [14]:
# n_jobs = -1 automatically detects the number of CPUs and uses all of them
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)

In [15]:
gs_clf.best_score_
for param_name in sorted(parameters.keys()):
  print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))


clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 2)


In [16]:
results_df = pd.DataFrame(gs_clf.cv_results_)
results_df


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__alpha,param_tfidf__use_idf,param_vect__ngram_range,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,4.562967,0.204772,0.642171,0.062093,0.01,True,"(1, 1)","{'clf__alpha': 0.01, 'tfidf__use_idf': True, '...",0.573833,0.569328,0.570516,0.565665,0.565814,0.569031,0.003067,6
1,17.735479,1.435859,2.098346,0.092145,0.01,True,"(1, 2)","{'clf__alpha': 0.01, 'tfidf__use_idf': True, '...",0.608485,0.602247,0.605515,0.607049,0.601703,0.605,0.002648,2
2,5.158236,0.578517,0.681346,0.057018,0.01,False,"(1, 1)","{'clf__alpha': 0.01, 'tfidf__use_idf': False, ...",0.561705,0.557646,0.562348,0.557052,0.558685,0.559487,0.002148,8
3,16.946556,0.549225,1.895764,0.098344,0.01,False,"(1, 2)","{'clf__alpha': 0.01, 'tfidf__use_idf': False, ...",0.56319,0.558784,0.561358,0.560022,0.560566,0.560784,0.001466,7
4,4.989622,0.541587,0.730322,0.127181,0.001,True,"(1, 1)","{'clf__alpha': 0.001, 'tfidf__use_idf': True, ...",0.594278,0.591604,0.592842,0.595713,0.595119,0.593911,0.001504,3
5,18.666392,0.471575,2.14042,0.210836,0.001,True,"(1, 2)","{'clf__alpha': 0.001, 'tfidf__use_idf': True, ...",0.612742,0.604228,0.607643,0.609475,0.604772,0.607772,0.003136,1
6,4.951309,0.544204,0.695628,0.132648,0.001,False,"(1, 1)","{'clf__alpha': 0.001, 'tfidf__use_idf': False,...",0.5871,0.580565,0.584278,0.58606,0.585021,0.584605,0.002233,5
7,14.603353,0.605008,1.511245,0.150702,0.001,False,"(1, 2)","{'clf__alpha': 0.001, 'tfidf__use_idf': False,...",0.592297,0.586308,0.588486,0.591951,0.588733,0.589555,0.002264,4
