## - Imports required libraries/modules.

In [68]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import cross_val_score
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet


### Loads or reads dataset/model files.

In [4]:
df = pd.read_csv("data/twitter_training.csv")

In [5]:
df.head()


Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [6]:
df.shape

(74681, 4)

In [7]:
df.columns

Index(['2401', 'Borderlands', 'Positive',
       'im getting on borderlands and i will murder you all ,'],
      dtype='object')

In [8]:
df.drop(["Borderlands","2401"],axis=1,inplace=True)

In [9]:
df["Positive"].value_counts()

Positive
Negative      22542
Positive      20831
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64

In [10]:
df.head()

Unnamed: 0,Positive,"im getting on borderlands and i will murder you all ,"
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...


In [15]:
df.rename(columns={
    "im getting on borderlands and i will murder you all ,": "content"
}, inplace=True)

In [16]:
df.head()

Unnamed: 0,Positive,content
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...


In [18]:
df.duplicated().value_counts()

False    69772
True      4909
Name: count, dtype: int64

In [19]:
df.drop_duplicates()

Unnamed: 0,Positive,content
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...
...,...,...
74676,Positive,Just realized that the Windows partition of my...
74677,Positive,Just realized that my Mac window partition is ...
74678,Positive,Just realized the windows partition of my Mac ...
74679,Positive,Just realized between the windows partition of...


In [20]:
df.isnull().sum()

Positive      0
content     686
dtype: int64

In [23]:
df = df.dropna(subset=['content'])


In [24]:
df.isnull().sum()

Positive    0
content     0
dtype: int64

In [65]:
!python -m spacy download en_core_web_sm


zsh:1: command not found: python


In [78]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

def lemmatize_nltk(text):
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/yuvashankarnarayana/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/yuvashankarnarayana/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [80]:
df['content'] = df['content'].apply(lemmatize_nltk)


In [81]:
text_clf = Pipeline([
    
    ('vect', CountVectorizer(ngram_range=(1,2))),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(class_weight='balanced', max_iter=5000))
])


In [82]:
x = df["content"]
y = df["Positive"]



In [83]:
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [84]:
x_train.shape,x_test.shape

((59196,), (14799,))

In [85]:
y_train.shape,y_test.shape

((59196,), (14799,))

In [86]:
text_clf.fit(x_train,y_train)

In [87]:
y_pred = text_clf.predict(x_test)


In [88]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9402662342050139

Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.95      0.92      0.94      2624
    Negative       0.96      0.94      0.95      4463
     Neutral       0.92      0.95      0.93      3589
    Positive       0.93      0.94      0.94      4123

    accuracy                           0.94     14799
   macro avg       0.94      0.94      0.94     14799
weighted avg       0.94      0.94      0.94     14799


Confusion Matrix:
 [[2406   52   87   79]
 [  39 4208  109  107]
 [  31   60 3413   85]
 [  44   85  106 3888]]


In [91]:
user_text = ["I love this game! It's amazing!"]
prediction = text_clf.predict(user_text)
print("Predicted class:", prediction[0])

Predicted class: Positive


In [94]:
import joblib

joblib.dump(text_clf,"text_clf.pkl")

['text_clf.pkl']