In [1]:
import pandas as pd
from sklearn import metrics

In [2]:
dataset = pd.read_csv('Tweets.csv')
X = dataset["text"]
y = dataset["airline_sentiment"]

In [3]:
#cleaning the cells
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, 6918):
    review = re.sub('[^a-zA-Z]', ' ', X[i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Zaoudre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
### Creating the Bag of Words model

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_df = .85, max_features = 1500)
X = cv.fit_transform(X).toarray()

In [6]:
### Splitting the dataset into the Training set and Test set

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [8]:
### Fitting Multinomial Naive Bayes to the Training set

from sklearn.naive_bayes import MultinomialNB
NB = MultinomialNB()
NB.fit(X_train, y_train)
y_pred = NB.predict(X_test)
print('\nNaive Bayes')
print('Accuracy Score: ',metrics.accuracy_score(y_test,y_pred)*100,'%',sep='')
print('Confusion Matrix: ',metrics.confusion_matrix(y_test,y_pred), sep = '\n')


Naive Bayes
Accuracy Score: 77.93715846994536%
Confusion Matrix: 
[[1612  188   70]
 [ 202  362   50]
 [  87   49  308]]


In [11]:
### K-fold cross validation for Naive Bayes

from sklearn.model_selection import cross_val_score
accuracies_NB = cross_val_score(estimator = NB, X = X_train, y = y_train, cv = 10)
mean_NB = accuracies_NB.mean()
std_NB = accuracies_NB.std()

In [12]:
### Fitting SVM classifier to the Training set

from sklearn.svm import LinearSVC
SVM = LinearSVC()
SVM.fit(X_train, y_train)
y_pred = SVM.predict(X_test)
print('\nSupport Vector Machine')
print('Accuracy Score: ',metrics.accuracy_score(y_test,y_pred)*100,'%',sep='')
print('Confusion Matrix: ',metrics.confusion_matrix(y_test,y_pred), sep = '\n')


Support Vector Machine
Accuracy Score: 78.72267759562843%
Confusion Matrix: 
[[1616  183   71]
 [ 182  378   54]
 [  73   60  311]]


In [16]:
### Fitting Linear Regression model to the Training set

from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(X_train, y_train)
y_pred = LR.predict(X_test)
print('\nLogistic Regression')
print('Accuracy Score: ',metrics.accuracy_score(y_test,y_pred)*100,'%',sep='')
print('Confusion Matrix: ',metrics.confusion_matrix(y_test,y_pred), sep = '\n')





Logistic Regression
Accuracy Score: 79.61065573770492%
Confusion Matrix: 
[[1659  156   55]
 [ 190  374   50]
 [  79   67  298]]


In [19]:
###Fitting K Nearest Neighbor classifier to the Training set

from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier(n_neighbors = 3)
KNN.fit(X_train, y_train)
y_pred = KNN.predict(X_test)
print('\nK Nearest Neighbors')
print('Accuracy Score: ',metrics.accuracy_score(y_test,y_pred)*100,'%',sep='')
print('Confusion Matrix: ',metrics.confusion_matrix(y_test,y_pred), sep = '\n')



K Nearest Neighbors
Accuracy Score: 51.775956284153004%
Confusion Matrix: 
[[927 714 229]
 [149 375  90]
 [ 90 140 214]]


In [20]:
### Fitting Decision Tree Classifier to the Training set

from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
DT.fit(X_train, y_train)
y_pred = DT.predict(X_test)
print('\nDecision Tree')
print('Accuracy Score: ',metrics.accuracy_score(y_test,y_pred)*100,'%',sep='')
print('Confusion Matrix: ',metrics.confusion_matrix(y_test,y_pred), sep = '\n')


Decision Tree
Accuracy Score: 67.99863387978142%
Confusion Matrix: 
[[1455  263  152]
 [ 238  299   77]
 [ 115   92  237]]


In [21]:
### Fitting Random Forest Classifier to the Training set

from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators = 15, criterion = 'entropy', random_state = 0)
RF.fit(X_train, y_train)
y_pred = RF.predict(X_test)
print('\nRandom Forest')
print('Accuracy Score: ',metrics.accuracy_score(y_test,y_pred)*100,'%',sep='')
print('Confusion Matrix: ',metrics.confusion_matrix(y_test,y_pred), sep = '\n')


Random Forest
Accuracy Score: 74.62431693989072%
Confusion Matrix: 
[[1722  116   32]
 [ 322  264   28]
 [ 170   75  199]]


In [22]:
### Analysing the model

token_words = cv.get_feature_names()
print('\n Analysis')
print('Number of tokens: ',len(token_words))
counts = NB.feature_count_
df_table = {'Token':token_words,'Negative': counts[0,:],'Positive': counts[1,:]}
tokens = pd.DataFrame(df_table, columns= ['Token','Positive','Negative'])
positives = len(tokens[tokens['Positive']>tokens['Negative']])
print('No. of positive tokens: ',positives)
print('No. of negative tokens: ',len(token_words)-positives)


 Analysis
Number of tokens:  1500
No. of positive tokens:  78
No. of negative tokens:  1422


In [23]:
### Check positivity/negativity of specific tokens

token_search = ['awesome']
print('\nSearch Results for token/s:',token_search)
print(tokens.loc[tokens['Token'].isin(token_search)])


Search Results for token/s: ['awesome']
       Token  Positive  Negative
162  awesome       5.0      12.0


In [24]:
### Analyse False Negatives (Actual: 1; Predicted: 0)(Predicted negative review for a positive review)

print(X_test[ y_pred < y_test ])

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [25]:
### Analyse False Positives (Actual: 0; Predicted: 1)(Predicted positive review for a negative review)

print(X_test[ y_pred > y_test ])

[[0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
