In [219]:
import pandas as pd
import os
import csv
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')

In [220]:
df = pd.read_csv('../Dataset/data.csv', delimiter="@@@", skiprows=2, encoding='utf-8', engine='python')

In [221]:
okgo = pd.read_csv('../Dataset/OKGO.csv', delimiter=";", skiprows=2, encoding='latin-1', engine='python') 
trump = pd.read_csv('../Dataset/trump.csv', delimiter=",", skiprows=2, encoding='utf-8', engine='python')
swift = pd.read_csv('../Dataset/TaylorSwift.csv', delimiter=",", skiprows=2, nrows=180, encoding='utf-8', engine='python')
royal = pd.read_csv('../Dataset/RoyalWedding.csv', delimiter=",", skiprows=2, nrows=61, encoding='utf-8', engine='python')
paul = pd.read_csv('../Dataset/LoganPaul.csv', delimiter=",", skiprows=2, nrows=200, encoding='utf-8', engine='python')
blogs = pd.read_csv('../Dataset/Kaggle.csv', delimiter=",", skiprows=2, encoding='latin-1', engine='python') 
tweets = pd.read_csv('../Dataset/twitter.csv', delimiter=",", skiprows=2, encoding='latin-1', engine='python') 

In [222]:
tweets = tweets.drop(['Topic', 'TweetId', "TweetDate"], axis = 1).dropna()

In [223]:
def fix_columns(data_frame):
    data_frame = data_frame.iloc[:, :2]
    data_frame.columns = ['label', 'comment']
    return data_frame

In [224]:
okgo = fix_columns(okgo)
trump = fix_columns(trump)
swift = fix_columns(swift)
royal = fix_columns(royal)
paul = fix_columns(paul)
tweets = fix_columns(tweets)

In [225]:
tweets.label = tweets.label.replace({'positive': '1.0', 'negative':'-1.0', 'neutral': '0.0', 'irrelevant': '0.0'}, regex=True)
tweets['label'] = pd.to_numeric(tweets['label'], errors='coerce')

In [226]:
tweets = fix_columns(tweets)
blogs = fix_columns(blogs)

In [227]:
videos = pd.concat([okgo, trump, swift, royal, paul], ignore_index=True)
data = videos.copy()
data = fix_columns(data)

In [228]:
full = pd.concat([videos, tweets, blogs], ignore_index=True)
full = fix_columns(full)

In [229]:
df.columns = ["comment", "label"]

In [230]:
def as_str(DF):
    DF["comment"]= DF["comment"].astype(str) 

In [231]:
as_str(df)
as_str(data)
as_str(full)

In [232]:
def cleaner_fn(b):
    for row in range(len(b)):
        line = b.loc[row, "comment"]
        b.loc[row,"comment"] = re.sub("[^a-zA-Z]", " ", line)
cleaner_fn(df)
cleaner_fn(data)
cleaner_fn(full)

### NLP

In [233]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [234]:
sw = stopwords.words('english')
ps = PorterStemmer()
lemmatizer = nltk.stem.WordNetLemmatizer()

### Remove Stop Words, Lemmatization, Stemming

In [235]:
def nlp_function(DF):
    DF['com_token'] = DF['comment'].str.lower().str.split()
    DF['com_remv'] = DF['com_token'].apply(lambda x: [y for y in x if y not in sw])
    DF["com_lemma"] = DF['com_remv'].apply(lambda x : [lemmatizer.lemmatize(y) for y in x]) # lemmatization
    DF['com_stem'] = DF['com_lemma'].apply(lambda x : [ps.stem(y) for y in x]) # stemming
    DF["com_tok_str"] = DF["com_stem"].apply(', '.join)
    DF["com_full"] = DF["com_stem"].apply(' '.join)
    #DF["com_tagged"] = DF['comment'].apply(lambda x : [nltk.pos_tag(y) for y in x]) #word tagging
    return DF
df = nlp_function(df)
data = nlp_function(data)
full= nlp_function(full)

In [236]:
data = data.dropna()
full = full.dropna()

### Vectorization

In [237]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

In [238]:
def word_to_vectors(vectorizer,  x_train, x_test):
    x_train = vectorizer.fit_transform(x_train)
    x_test = vectorizer.transform(x_test)
    return (x_train, x_test)    

In [239]:
count_vectorizer_data = CountVectorizer()
count_vectorizer_full = CountVectorizer()

In [240]:
x_train_data, x_test_data, y_train_data, y_test_data = train_test_split(data['com_full'], data['label'], test_size=0.2, shuffle=True)
x_train_data, x_test_data = word_to_vectors(count_vectorizer_data, x_train_data, x_test_data)

In [241]:
x_train_full, x_test_full, y_train_full, y_test_full = train_test_split(full['com_full'], full['label'], test_size=0.2, shuffle=True)
x_train_full, x_test_full = word_to_vectors(count_vectorizer_full, x_train_full, x_test_full)

In [242]:
### Model Building

In [243]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score

###  Logistic Regression

In [244]:
lr_data = LogisticRegression(solver='sag', max_iter=100, random_state=42, multi_class="multinomial") 
lr_full = LogisticRegression(solver='sag', max_iter=100, random_state=42, multi_class="multinomial") 

In [245]:
lr_data.fit(x_train_data, y_train_data)
accuracy_score(y_test_data, lr_data.predict(x_test_data))

0.6394686907020873

In [246]:
lr_full.fit(x_train_full, y_train_full)
accuracy_score(y_test_full, lr_full.predict(x_test_full))

0.8672059319177621

### SVM

In [247]:
svm_data = SVC()
svm_full = SVC()

In [248]:
svm_data.fit(x_train_data, y_train_data)
accuracy_score(y_test_data, svm_data.predict(x_test_data))

0.6660341555977229

In [249]:
svm_full.fit(x_train_full, y_train_full)
accuracy_score(y_test_full, svm_full.predict(x_test_full))

0.8597910347152006

### KNN

In [135]:
knn_data = KNeighborsClassifier()
knn_full = KNeighborsClassifier()

In [136]:
knn_data.fit(x_train_data, y_train_data)
accuracy_score(y_test_data, knn_data.predict(x_test_data))

0.5635673624288425

In [187]:
knn_full.fit(x_train_full, y_train_full)
accuracy_score(y_test_full, knn_full.predict(x_test_full))

0.8378833838894506

### Random Forest Classifier

In [188]:
random_forest_classifier_data = RandomForestClassifier(n_estimators=10, random_state=10)
random_forest_classifier_full = RandomForestClassifier(n_estimators=10, random_state=10)

In [189]:
random_forest_classifier_data.fit(x_train_data, y_train_data)
accuracy_score(y_test_data, random_forest_classifier_data.predict(x_test_data))

0.5920303605313093

In [190]:
random_forest_classifier_full.fit(x_train_full, y_train_full)
accuracy_score(y_test_full, random_forest_classifier_full.predict(x_test_full))

0.847994607347489

###  Extreme Gradient Boosting

In [191]:
xgb_data = XGBClassifier()
xgb_full = XGBClassifier()

In [192]:
xgb_data.fit(x_train_data, y_train_data)
accuracy_score(y_test_data, xgb_data.predict(x_test_data))

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2], got [-1.  0.  1.]

### Save the models

In [None]:
import joblib

In [206]:
joblib.dump(lr_data, 'lr_data.pkl')
joblib.dump(lr_full, 'lr_data_full.pkl')


['lr_data_full.pkl']

In [None]:
data.to_csv('data.csv')
full.to_csv('full.csv')

In [193]:
svm_full.predict()

TypeError: BaseSVC.predict() missing 1 required positional argument: 'X'

In [194]:
joblib.dump(count_vectorizer_full, 'count_vectorizer_full.pkl')
joblib.dump(count_vectorizer_data, 'count_vectorizer_data.pkl')

['count_vectorizer_data.pkl']

### Testing

In [195]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

In [196]:
data = pd.read_csv('data.csv')

In [197]:
data.dropna()

Unnamed: 0.1,Unnamed: 0,label,comment,com_token,com_remv,com_lemma,com_stem,com_tok_str,com_full
0,0,-1.0,Everyone knows brand s papers from But No on...,"['everyone', 'knows', 'brand', 's', 'papers', ...","['everyone', 'knows', 'brand', 'papers', 'one'...","['everyone', 'know', 'brand', 'paper', 'one', ...","['everyon', 'know', 'brand', 'paper', 'one', '...","everyon, know, brand, paper, one, know, welfar...",everyon know brand paper one know welfar emplo...
1,1,0.0,Your paper cut balance is,"['your', 'paper', 'cut', 'balance', 'is']","['paper', 'cut', 'balance']","['paper', 'cut', 'balance']","['paper', 'cut', 'balanc']","paper, cut, balanc",paper cut balanc
2,2,1.0,OH SHIT WHEN I SAW THIS ON MY FRONT PAGE ...,"['oh', 'shit', 'when', 'i', 'saw', 'this', 'on...","['oh', 'shit', 'saw', 'front', 'page', 'love',...","['oh', 'shit', 'saw', 'front', 'page', 'love',...","['oh', 'shit', 'saw', 'front', 'page', 'love',...","oh, shit, saw, front, page, love, song",oh shit saw front page love song
3,3,1.0,Blowing my mind yet again,"['blowing', 'my', 'mind', 'yet', 'again']","['blowing', 'mind', 'yet']","['blowing', 'mind', 'yet']","['blow', 'mind', 'yet']","blow, mind, yet",blow mind yet
4,4,0.0,Should have gone with Dunder Mifflin,"['should', 'have', 'gone', 'with', 'dunder', '...","['gone', 'dunder', 'mifflin']","['gone', 'dunder', 'mifflin']","['gone', 'dunder', 'mifflin']","gone, dunder, mifflin",gone dunder mifflin
...,...,...,...,...,...,...,...,...,...
2628,4983,-1.0,He makes Americans look bad,"['he', 'makes', 'americans', 'look', 'bad']","['makes', 'americans', 'look', 'bad']","['make', 'american', 'look', 'bad']","['make', 'american', 'look', 'bad']","make, american, look, bad",make american look bad
2629,4984,-1.0,OMG no stop Japan my fav STOPP,"['omg', 'no', 'stop', 'japan', 'my', 'fav', 's...","['omg', 'stop', 'japan', 'fav', 'stopp']","['omg', 'stop', 'japan', 'fav', 'stopp']","['omg', 'stop', 'japan', 'fav', 'stopp']","omg, stop, japan, fav, stopp",omg stop japan fav stopp
2630,4985,-1.0,This guy is making the U S look bad,"['this', 'guy', 'is', 'making', 'the', 'u', 's...","['guy', 'making', 'u', 'look', 'bad']","['guy', 'making', 'u', 'look', 'bad']","['guy', 'make', 'u', 'look', 'bad']","guy, make, u, look, bad",guy make u look bad
2631,4986,-1.0,I thought Logan Paul was nicer than jake Paul ...,"['i', 'thought', 'logan', 'paul', 'was', 'nice...","['thought', 'logan', 'paul', 'nicer', 'jake', ...","['thought', 'logan', 'paul', 'nicer', 'jake', ...","['thought', 'logan', 'paul', 'nicer', 'jake', ...","thought, logan, paul, nicer, jake, paul, wrong",thought logan paul nicer jake paul wrong


In [198]:
cv = CountVectorizer()
x_train_data = CountVectorizer.fit_transform(pd.DataFrame(data['com_full']))

AttributeError: 'DataFrame' object has no attribute '_validate_params'

In [199]:
svm_data.predict(count_vectorizer_data.transform(['Bad video hate ']))[0]

0.0

In [202]:
cvf = joblib.load('count_vectorizer_full.pkl')

In [203]:
lr_full.predict(cvf.transform(['Good video']))

array([1.])

In [207]:
import joblib
import regex as re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

sw = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

#  Load the model
model = joblib.load('svm_full.pkl')
vectorizer = joblib.load('count_vectorizer_full.pkl')

def clean(s):
    s = re.sub("[^a-zA-Z]", " ", s)
    words = s.split()
    rem_sw = [word for word in words if word not in sw]
    lemm_words = [lemmatizer.lemmatize(word) for word in rem_sw]
    return ' '.join(lemm_words)


def compute_results(comments):
    positive, negative, neutral = (0, 0, 0)
    for comment in comments:
        comment = clean(comment)
        output = model.predict(vectorizer.transform([comment]))[0]
        if output == 0:
            neutral += 1
        elif output == 1:
            positive += 1
        else:
            negative += 1
    return (positive, negative, neutral)
print(compute_results(['Good Video']))
    
    

ValueError: X has 13493 features, but SVC is expecting 13536 features as input.

In [215]:
joblib.dump(svm_full, 'svm_full.pkl')
model = joblib.load('svm_full.pkl')
# svm_full.predict(vectorizer.transform(['Good Video']))

In [218]:
model.predict(vectorizer.transform(['Good Video']))

array([1.])

In [250]:
joblib.dump(svm_full, 'saved_svm_full.pkl')
joblib.dump(count_vectorizer_full, 'saved_count_vectorizer_full.pkl')


['saved_count_vectorizer_full.pkl']

In [253]:
model = joblib.load('saved_svm_full.pkl')
vectorizer = joblib.load('saved_count_vectorizer_full.pkl')

In [254]:
model.predict(vectorizer.transform(['Good Video']))

array([1.])