In [33]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re

In [34]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [35]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [36]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,roc_curve,classification_report

In [37]:
df_train = pd.read_csv("SemEval2018-T3-train-taskA_emoji.csv",delimiter=',')

In [38]:
df_train

Unnamed: 0.1,Unnamed: 0,Tweet index,Label,Tweet text
0,0,1,1,Sweet United Nations video. Just in time for C...
1,1,2,1,@mrdahl87 We are rumored to have talked to Erv...
2,2,3,1,Hey there! Nice to see you Minnesota/ND Winter...
3,3,4,0,3 episodes left I'm dying over here
4,4,5,1,I can't breathe! was chosen as the most notabl...
...,...,...,...,...
3812,3812,3830,0,@banditelli regarding what the PSU president does
3813,3813,3831,0,@banditelli But still bothers me that I see no...
3814,3814,3832,0,well now that i've listened to all of into the...
3815,3815,3833,0,Hummingbirds #Are #Experts #at #Hovering #Aft...


In [39]:
lm = WordNetLemmatizer()

In [40]:
def text_transformation(df_col):
    corpus = []
    for item in df_col:
        new_item = re.sub('[^a-zA-Z]',' ',str(item))
        new_item = new_item.lower()
        new_item = new_item.split()
        new_item = [lm.lemmatize(word) for word in new_item if word not in set(stopwords.words('english'))]
        corpus.append(' '.join(str(x) for x in new_item))
    return corpus

In [41]:
corpus = text_transformation(df_train['Tweet text'])

In [42]:
cv = CountVectorizer(ngram_range=(1,2))
traindata = cv.fit_transform(corpus)
X = traindata

In [43]:
y = df_train['Label']

In [44]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [45]:
## Decision Tree

In [46]:
d_tree = DecisionTreeClassifier()
d_tree.fit(X,y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [47]:
test_corpus = text_transformation(X_test)

In [48]:
testdata = cv.transform(test_corpus)

In [49]:
predictions = d_tree.predict(testdata)

In [50]:
from sklearn.metrics import confusion_matrix, classification_report

In [51]:
print(confusion_matrix(y_test , predictions))
print('\n')
print(classification_report(y_test,predictions))

[[377   0]
 [387   0]]


              precision    recall  f1-score   support

           0       0.49      1.00      0.66       377
           1       0.00      0.00      0.00       387

    accuracy                           0.49       764
   macro avg       0.25      0.50      0.33       764
weighted avg       0.24      0.49      0.33       764



  _warn_prf(average, modifier, msg_start, len(result))


In [52]:
## SVM

In [53]:
SVM_model = SVC(kernel = 'linear')
SVM_model.fit(X,y)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [54]:
predictions2 = SVM_model.predict(testdata)

In [55]:
print(confusion_matrix(y_test , predictions2))
print('\n')
print(classification_report(y_test,predictions2))

[[377   0]
 [387   0]]


              precision    recall  f1-score   support

           0       0.49      1.00      0.66       377
           1       0.00      0.00      0.00       387

    accuracy                           0.49       764
   macro avg       0.25      0.50      0.33       764
weighted avg       0.24      0.49      0.33       764



  _warn_prf(average, modifier, msg_start, len(result))


In [56]:
## Naive Bayes

In [57]:
NB = MultinomialNB()
NB.fit(X,y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [58]:
predictions = NB.predict(testdata)

In [59]:
print(confusion_matrix(y_test , predictions))
print('\n')
print(classification_report(y_test,predictions))

[[377   0]
 [387   0]]


              precision    recall  f1-score   support

           0       0.49      1.00      0.66       377
           1       0.00      0.00      0.00       387

    accuracy                           0.49       764
   macro avg       0.25      0.50      0.33       764
weighted avg       0.24      0.49      0.33       764



  _warn_prf(average, modifier, msg_start, len(result))
