In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re, string, warnings, nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from google.colab import drive 
drive.mount('/content/gdrive')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Read the annotated tweet corpus from the mounted Drive folder and preview it.
dataset_csv = 'gdrive/MyDrive/twitter_dataset.csv'
data = pd.read_csv(dataset_csv)
data

Unnamed: 0,index,id,Text,Annotation,oh_label
0,5.74948705591165E+017,5.74948705591165E+017,@halalflaws @biebervalue @greenlinerzjm I read...,none,0.0
1,5.71917888690393E+017,5.71917888690393E+017,@ShreyaBafna3 Now you idiots claim that people...,none,0.0
2,3.90255841338601E+017,3.90255841338601E+017,"RT @Mooseoftorment Call me sexist, but when I ...",sexism,1.0
3,5.68208850655916E+017,5.68208850655916E+017,"@g0ssipsquirrelx Wrong, ISIS follows the examp...",racism,1.0
4,5.75596338802373E+017,5.75596338802373E+017,#mkr No No No No No No,none,0.0
...,...,...,...,...,...
16846,5.75606766236475E+017,5.75606766236475E+017,"Feeling so sorry for the girls, they should be...",none,0.0
16847,5.72333822886326E+017,5.72333822886326E+017,#MKR 'pretty good dishes we're happy with' - O...,none,0.0
16848,5.72326950057845E+017,5.72326950057845E+017,RT @colonelkickhead: Deconstructed lemon tart!...,none,0.0
16849,5.74799612642357E+017,5.74799612642357E+017,@versacezaynx @nyazpolitics @greenlinerzjm You...,none,0.0


In [None]:
wordnet_lemmatizer = WordNetLemmatizer()
# Built once at definition time: calling stopwords.words('english') per token
# would rebuild the whole list for every word. A frozenset also makes each
# membership test a hash lookup instead of a linear scan.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
# Anything that is not an ASCII letter is treated as a separator.
_NON_LETTERS = re.compile("[^a-zA-Z]")


def normalizer(tweet):
  """Return a cleaned, lemmatized version of ``tweet`` as a single string.

  Steps, in order:
    1. Convert the input to ``str`` and replace every non-letter with a space.
    2. Lower-case the text and split it on whitespace.
    3. Drop English stop words.
    4. Lemmatize each remaining token with WordNet.

  Args:
    tweet: The raw tweet text. Any object is accepted because it is passed
      through ``str()`` first.

  Returns:
    The surviving lemmas joined by single spaces (an empty string when no
    token survives).
  """
  tokens = _NON_LETTERS.sub(" ", str(tweet)).lower().split()
  lemmas = [
      wordnet_lemmatizer.lemmatize(token)
      for token in tokens
      if token not in _ENGLISH_STOPWORDS
  ]
  return " ".join(lemmas)

In [None]:
# Shuffle with a fixed seed so the row order (and everything derived from it)
# is reproducible. The result gets its own name so the loaded frame stays
# untouched and re-running this cell always yields the same ordering.
data_shuffled = shuffle(data, random_state=42)
# NOTE(review): the integer cast assumes 'oh_label' has no missing values - confirm.
y = data_shuffled['oh_label'].values.astype(np.int64)
# Clean every tweet with normalizer before it is vectorized.
x = data_shuffled['Text'].apply(normalizer)

In [None]:
# Learn a bag-of-words vocabulary from the cleaned tweets and encode them as a
# document-term count matrix.
# NOTE(review): the vocabulary is fitted on every row before the train/test
# split, so held-out tweets contribute tokens to it - consider fitting on the
# training rows only.
vectorizer = CountVectorizer()
x_vectorized = vectorizer.fit_transform(x)

In [None]:
# Display the summary repr of the count matrix produced by the vectorizer.
x_vectorized

<16851x23273 sparse matrix of type '<class 'numpy.int64'>'
	with 154632 stored elements in Compressed Sparse Row format>

In [None]:
# Hold out 20% of the rows for evaluation. The fixed seed makes the split, and
# therefore every score computed from it, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_vectorized, y, test_size=.2, random_state=42)

In [None]:
# Support Vector Machine algorithm (linear kernel): fit on the training split,
# then predict on the training split first and the test split second.
svm = SVC(C=1.0, kernel='linear', degree=3, gamma='auto').fit(x_train, y_train)
pred_train, pred_test = (svm.predict(features) for features in (x_train, x_test))

In [None]:
# K-Nearest Neighbours algorithm (k = 3): fit on the training split, then
# predict on the training split first and the test split second.
knn = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)
pred_train_knn, pred_test_knn = (knn.predict(features) for features in (x_train, x_test))

In [None]:
# Logistic Regression algorithm (multinomial loss, newton-cg solver): fit on
# the training split, then predict on the training and test splits in turn.
logreg = LogisticRegression(multi_class='multinomial', solver='newton-cg').fit(x_train, y_train)
pred_train_logreg, pred_test_logreg = (
    logreg.predict(features) for features in (x_train, x_test)
)

In [None]:
# Stacking model: the SVM, KNN and logistic-regression classifiers act as base
# learners and a logistic regression combines their outputs.
estimators = [('svm', svm), ('knn', knn), ('logreg', logreg)]
# The meta-learner uses the default solver, which stopped at its iteration
# limit with the default max_iter (see the convergence message this cell
# printed). Give it more iterations so it can converge.
stack_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=1000),
)
stack_model.fit(x_train, y_train)
pred_train_stack = stack_model.predict(x_train)
pred_test_stack = stack_model.predict(x_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# **Model Evaluation**

In [None]:
# Linear SVM: accuracy (%) on the training split, then on the held-out split.
svm_report = (
    f"SVM Accuracy Score for Train Data: {accuracy_score(y_train, pred_train)*100:.2f}",
    f"SVM Accuracy Score for Test Data: {accuracy_score(y_test, pred_test)*100:.2f}",
)
print("\n".join(svm_report))

SVM Accuracy Score for Train Data: 98.37
SVM Accuracy Score for Test Data: 84.01


In [None]:
# KNN: accuracy (%) on the training split, then on the held-out split.
knn_report = (
    f"KNN Accuracy Score for Train Data: {accuracy_score(y_train, pred_train_knn)*100:.2f}",
    f"KNN Accuracy Score for Test Data: {accuracy_score(y_test, pred_test_knn)*100:.2f}",
)
print("\n".join(knn_report))

KNN Accuracy Score for Train Data: 80.15
KNN Accuracy Score for Test Data: 72.44


In [None]:
# Logistic regression: accuracy (%) on the training split, then on the held-out split.
logreg_report = (
    f"Logistic Reg Accuracy Score for Train Data: {accuracy_score(y_train, pred_train_logreg)*100:.2f}",
    f"Logistic Reg Accuracy Score for Test Data: {accuracy_score(y_test, pred_test_logreg)*100:.2f}",
)
print("\n".join(logreg_report))

Logistic Reg Accuracy Score for Train Data: 97.37
Logistic Reg Accuracy Score for Test Data: 85.26


In [None]:
# Stacked ensemble: accuracy (%) on the training split, then on the held-out split.
stack_report = (
    f"Stacking Accuracy Score for Train Data: {accuracy_score(y_train, pred_train_stack)*100:.2f}",
    f"Stacking Accuracy Score for Test Data: {accuracy_score(y_test, pred_test_stack)*100:.2f}",
)
print("\n".join(stack_report))

Stacking Accuracy Score for Train Data: 97.72
Stacking Accuracy Score for Test Data: 85.52


In [None]:
# F1 of the stacked ensemble on the training and held-out splits.
# Macro averaging is used because micro-averaged F1 is numerically identical to
# accuracy when each sample has exactly one label, so it would only repeat the
# accuracy figures. Macro F1 weights every class equally and therefore shows
# how well the less frequent class is handled.
print(f"F1 Score for Train Data: {f1_score(y_train, pred_train_stack, average='macro')*100:.2f}")
print(f"F1 Score for Test Data: {f1_score(y_test, pred_test_stack, average='macro')*100:.2f}")

F1 Score for Train Data: 97.72
F1 Score for Test Data: 85.52


In [None]:
# Classify one hand-written example. The text goes through normalizer first,
# exactly like the training tweets, so its tokens line up with the vocabulary
# the vectorizer learned.
sample_tweet = "yeah so annoying how they're doing their jobs @Dean_Carr Call me sexist but woman football commentators annoy me so much, they never shut up"
test_feature = vectorizer.transform([normalizer(sample_tweet)])
stack_model.predict(test_feature)

array([1])

In [None]:
import pickle

In [None]:
# Persist the fitted vectorizer and the stacked classifier together so both can
# be restored from a single file.
files = {'vectorizer': vectorizer, 'model': stack_model}
# The context manager flushes and closes the file even if pickling fails.
with open('models.pkl', "wb") as model_file:
  pickle.dump(files, model_file)

In [None]:
# Reload the saved bundle (a dict with 'vectorizer' and 'model' entries); the
# context manager closes the file handle once loading finishes.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('models.pkl', "rb") as model_file:
  load_model = pickle.load(model_file)

In [None]:
# Sanity check: run the classifier restored from disk on the same sample features.
restored_classifier = load_model['model']
restored_classifier.predict(test_feature)

array([1])