# Imports

In [58]:
import pandas as pd
pd.options.mode.chained_assignment = None

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.base import TransformerMixin
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

### Setup Spacy NLP

In [59]:
nlp = en_core_web_sm.load()
spacy_tokenizer = nlp.tokenizer

### Load Dataset(s)

In [60]:
df = pd.read_csv('data/train.csv')

### Explore & Clean Data

In [61]:
df.head(25)

Unnamed: 0,2401,Borderlands,1,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,1,I am coming to the borders and I will kill you...
1,2401,Borderlands,1,im getting on borderlands and i will kill you ...
2,2401,Borderlands,1,im coming on borderlands and i will murder you...
3,2401,Borderlands,1,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,1,im getting into borderlands and i can murder y...
5,2402,Borderlands,1,So I spent a few hours making something for fu...
6,2402,Borderlands,1,So I spent a couple of hours doing something f...
7,2402,Borderlands,1,So I spent a few hours doing something for fun...
8,2402,Borderlands,1,So I spent a few hours making something for fu...
9,2402,Borderlands,1,2010 So I spent a few hours making something f...


In [62]:
df.columns = ['id', 'entity', 'sentiment', 'review']

In [63]:
train = df.drop(columns=['id', 'entity'])

In [64]:
train.head(25)

Unnamed: 0,sentiment,review
0,1,I am coming to the borders and I will kill you...
1,1,im getting on borderlands and i will kill you ...
2,1,im coming on borderlands and i will murder you...
3,1,im getting on borderlands 2 and i will murder ...
4,1,im getting into borderlands and i can murder y...
5,1,So I spent a few hours making something for fu...
6,1,So I spent a couple of hours doing something f...
7,1,So I spent a few hours doing something for fun...
8,1,So I spent a few hours making something for fu...
9,1,2010 So I spent a few hours making something f...


In [65]:
train.value_counts('sentiment')

sentiment
 0    31308
-1    22542
 1    20831
dtype: int64

In [66]:
train = train[train['sentiment'] != 0]

In [67]:
train.value_counts('sentiment')

sentiment
-1    22542
 1    20831
dtype: int64

In [68]:
train['sentiment'].mask(train['sentiment'] == -1, 0, inplace=True)

In [69]:
train.value_counts('sentiment')

sentiment
0    22542
1    20831
dtype: int64

In [70]:
train.isna().sum()

sentiment      0
review       361
dtype: int64

In [71]:
train.dropna(axis=0, inplace=True)

In [72]:
train.isna().sum()

sentiment    0
review       0
dtype: int64

In [73]:
stopwords = list(STOP_WORDS)
stopwords

['because',
 'out',
 'her',
 "'s",
 'does',
 'even',
 'will',
 'been',
 'any',
 'name',
 'same',
 "'d",
 'n’t',
 'by',
 'seeming',
 'unless',
 'them',
 'nowhere',
 'when',
 'towards',
 'was',
 'to',
 "'re",
 'your',
 'former',
 'who',
 'between',
 'hereby',
 'seem',
 'those',
 'part',
 'someone',
 'myself',
 'however',
 'ten',
 'have',
 'made',
 'other',
 'my',
 'across',
 'amongst',
 'empty',
 'yours',
 'after',
 '‘s',
 'besides',
 'much',
 'one',
 'for',
 'further',
 'top',
 'it',
 '‘d',
 '’m',
 'again',
 'last',
 'few',
 'anyone',
 'before',
 'thru',
 'many',
 'front',
 'others',
 'regarding',
 'during',
 '‘ve',
 'there',
 'hereupon',
 'two',
 'nevertheless',
 'whole',
 'sometimes',
 'had',
 'elsewhere',
 'should',
 'we',
 'us',
 'into',
 'least',
 'if',
 'none',
 'might',
 'hence',
 'though',
 'a',
 'above',
 'still',
 'each',
 'thereafter',
 'you',
 'formerly',
 'say',
 'thence',
 'nor',
 'perhaps',
 'else',
 'whoever',
 'only',
 'without',
 'whither',
 'do',
 'often',
 'whereupon

In [74]:
def clean_text(s):
    split_text = s.split()
    cleaned_text = [word for word in split_text if word not in stopwords]
    final_text = ' '.join(cleaned_text)
    return final_text

In [75]:
text = "So I spent a couple of hours doing something for fun... If you don't know that I'm a huge @ Borderlands fan and Maya is one of my favorite characters, I decided to make a wallpaper for my PC.. Her..."

clean_text(text)

"So I spent couple hours fun... If don't know I'm huge @ Borderlands fan Maya favorite characters, I decided wallpaper PC.. Her..."

In [76]:
train['review'] = train['review'].apply(clean_text)

In [77]:
train.head(25)

Unnamed: 0,sentiment,review
0,1,"I coming borders I kill all,"
1,1,"im getting borderlands kill all,"
2,1,"im coming borderlands murder all,"
3,1,"im getting borderlands 2 murder all,"
4,1,"im getting borderlands murder all,"
5,1,So I spent hours making fun. . . If don't know...
6,1,So I spent couple hours fun... If don't know I...
7,1,So I spent hours fun... If don't know I'm HUGE...
8,1,So I spent hours making fun. . . If don't know...
9,1,2010 So I spent hours making fun. . . If don't...


### Create Data Processing Pipeline

In [78]:
def cleaner(df_path):
    initial_df = pd.read_csv(df_path)
    initial_df.columns = ['id', 'entity', 'sentiment', 'review']
    initial_df.drop(columns=['id', 'entity'])
    prepped_df = initial_df[initial_df['sentiment'] != 0]
    prepped_df['sentiment'].mask(prepped_df['sentiment'] == -1, 0, inplace=True)
    prepped_df['review'] = prepped_df['review'].apply(clean_text)
    return prepped_df

#### Utilize data processing to prep test data

In [79]:
test = cleaner('data/test.csv')

In [80]:
test.head(25)

Unnamed: 0,id,entity,sentiment,review
1,8312,Microsoft,0,@Microsoft Why I pay WORD functions poorly @Sa...
2,4371,CS-GO,0,"CSGO matchmaking closet hacking, it's truly aw..."
4,6273,FIFA,0,Hi @EAHelp I’ve Madeleine McCann cellar past 1...
5,7925,MaddenNFL,1,Thank @EAMaddenNFL!! New TE Austin Hooper ORAN...
6,11332,TomClancysRainbowSix,1,"Rocket League, Sea Thieves Rainbow Six: Siege🤔..."
7,1107,AssassinsCreed,1,ass knee-deep Assassins Creed Odyssey way anyt...
8,2069,CallOfDuty,0,FIX IT JESUS ! Please FIX IT ! What In world g...
9,3185,Dota2,1,The professional dota 2 scene fucking explodin...
10,1172,AssassinsCreed,1,Itching assassinate #TCCGif #AssassinsCreedBla...
11,11783,Verizon,0,"@FredTJoseph hey fred, Comcast cut cable Veriz..."


### Setup sklearn imports

In [92]:
X_train = train['review']
y_train = train['sentiment']

X_test = test['review']
y_test = test['sentiment']

In [82]:
pipe = Pipeline([('vectorizer', TfidfVectorizer()),
                 ('classifier', LinearSVC())])

In [83]:
pipe_model = pipe.fit(X_train, y_train)

In [98]:
predictions = pipe_model.predict(X_test)
accuracy_score(y_test, predictions)

0.9815837937384899

In [99]:
pipe2 = Pipeline([('vectorizer', TfidfVectorizer()),
                 ('classifier', MultinomialNB())])

In [100]:
pipe2_model = pipe2.fit(X_train, y_train)

In [101]:
predictions2 = pipe2_model.predict(X_test)
accuracy_score(y_test, predictions2)

0.9392265193370166

In [88]:
pipe3 = Pipeline([('vectorizer', TfidfVectorizer()),
                 ('classifier', SGDClassifier(loss='hinge', penalty='l2',
                                              alpha=1e-3, random_state=42,
                                              max_iter=5, tol=None))])

In [89]:
pipe3_model = pipe3.fit(X_train, y_train)

In [102]:
predictions3 = pipe3_model.predict(X_test)
accuracy_score(y_test, predictions3)

0.8379373848987108

# Pipe Performance

Pipe #1 Accuracy: 0.9815837937384899
Pipe #2 Accuracy: 0.9392265193370166
Pipe #3 Accuracy: 0.8379373848987108

## Import joblib and save Pipe #1

In [109]:
import joblib

In [112]:
joblib.dump(pipe, 'sk_sentiment.joblib')

['sk_sentiment.joblib']