In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#Import packages
# Basics
import pandas as pd;
import os
import csv;
import numpy as np
import re;
import warnings
warnings.filterwarnings('ignore')

In [3]:
#Reading Pre-Labeled YouTube Video Comments
# training data
vDataset1 = pd.read_csv('/content/drive/MyDrive/Project_BCA/Dataset/video1.csv', delimiter=";", skiprows=2, encoding='latin-1', engine='python') # read in the data
vDataset2 = pd.read_csv('/content/drive/MyDrive/Project_BCA/Dataset/video2.csv', delimiter=",", skiprows=2, encoding='utf-8', error_bad_lines=False, engine='python')
vDataset3 = pd.read_csv('/content/drive/MyDrive/Project_BCA/Dataset/video3.csv', delimiter=",", skiprows=2, nrows=180, encoding='utf-8', engine='python')
vDataset4 = pd.read_csv('/content/drive/MyDrive/Project_BCA/Dataset/video4.csv', delimiter=",", skiprows=2, nrows=61, encoding='utf-8', engine='python')
vDataset5 = pd.read_csv('/content/drive/MyDrive/Project_BCA/Dataset/video5.csv', delimiter=",", skiprows=2, nrows=200, encoding='utf-8', engine='python')

In [4]:
#Reading Pre-Labeled Facebook & Instagram Comments
instagram = pd.read_csv('/content/drive/MyDrive/Project_BCA/Dataset/instagram.csv', delimiter=",", skiprows=2, encoding='latin-1', engine='python') # read in the data
facebook = pd.read_csv('/content/drive/MyDrive/Project_BCA/Dataset/facebook.csv', delimiter=",", skiprows=2, encoding='latin-1', engine='python') # read in the data

In [5]:
#Data Preprocessing
# clean dataframes
facebook = facebook.drop(['Topic', 'FacebookId', "Date"], axis = 1).dropna()
facebook.head()

Unnamed: 0,Sentiment,FbText
0,positive,Now all @Apple has to do is get swype on the i...
1,positive,@Apple will be adding more carrier support to ...
2,positive,Hilarious @youtube video - guy does a duet wit...
3,positive,@RIM you made it too easy for me to switch to ...
4,positive,I just realized that the reason I got into twi...


In [6]:
#Data Preprocessing
def fix_cols(DF):
    DF = DF.iloc[:,:2]
    DF.columns = ["label", "comment"]
    return DF

In [7]:
vDataset1 = fix_cols(vDataset1)
vDataset2 = fix_cols(vDataset2)
vDataset3 = fix_cols(vDataset3)
vDataset4 = fix_cols(vDataset4)
vDataset5 = fix_cols(vDataset5)
facebook = fix_cols(facebook)

vDataset1.head()

Unnamed: 0,label,comment
0,-1.0,Everyone knows brand's papers from.\rBut -No o...
1,0.0,ÒYour paper cut balance is: \r-£25279102771Ó
2,1.0,OH SHIT WHEN I SAW THIS ON MY FRONT PAGE.........
3,1.0,Blowing my mind yet again
4,0.0,Should have gone with Dunder Mifflin


In [8]:
facebook.label = facebook.label.replace({'positive': '1.0', 'negative':'-1.0', 'neutral': '0.0', 'irrelevant': '0.0'}, regex=True)
facebook['label'] = pd.to_numeric(facebook['label'], errors='coerce')

In [9]:
facebook = fix_cols(facebook)
instagram = fix_cols(instagram)

facebook.head()

Unnamed: 0,label,comment
0,1.0,Now all @Apple has to do is get swype on the i...
1,1.0,@Apple will be adding more carrier support to ...
2,1.0,Hilarious @youtube video - guy does a duet wit...
3,1.0,@RIM you made it too easy for me to switch to ...
4,1.0,I just realized that the reason I got into twi...


In [10]:
#Create Datasets
comments = pd.concat([vDataset1, vDataset2, vDataset3, vDataset4, vDataset5,instagram, facebook], ignore_index=True)
comments.head()

Unnamed: 0,label,comment
0,-1.0,Everyone knows brand's papers from.\rBut -No o...
1,0.0,ÒYour paper cut balance is: \r-£25279102771Ó
2,1.0,OH SHIT WHEN I SAW THIS ON MY FRONT PAGE.........
3,1.0,Blowing my mind yet again
4,0.0,Should have gone with Dunder Mifflin


In [11]:
#Remove Non-Alphabetic Characters (including numbers)

def convert_to_string(DF):
    DF["comment"]= DF["comment"].astype(str)

In [12]:
convert_to_string(comments)

In [13]:
def cleanerFn(b):
    # keeps only words with alphabetic characters in comments
    for row in range(len(b)):
        line = b.loc[row, "comment"]
        b.loc[row,"comment"] = re.sub("[^a-zA-Z]", " ", line)

In [14]:
cleanerFn(comments)
comments.head()

Unnamed: 0,label,comment
0,-1.0,Everyone knows brand s papers from But No on...
1,0.0,Your paper cut balance is
2,1.0,OH SHIT WHEN I SAW THIS ON MY FRONT PAGE ...
3,1.0,Blowing my mind yet again
4,0.0,Should have gone with Dunder Mifflin


In [15]:
#Natural Language Processing

import nltk
nltk.download('all')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize,sent_tokenize

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   U

In [16]:
sw = stopwords.words('english')
ps = PorterStemmer()
lemmatizer = nltk.stem.WordNetLemmatizer()

In [17]:
#Tokenization, Remove Stop Words, Lemmatization & Stemming
def nlpFunction(DF):
    DF['com_token'] = DF['comment'].str.lower().str.split()
    DF['com_remv'] = DF['com_token'].apply(lambda x: [y for y in x if y not in sw])
    DF["com_lemma"] = DF['com_remv'].apply(lambda x : [lemmatizer.lemmatize(y) for y in x]) # lemmatization
    DF['com_stem'] = DF['com_lemma'].apply(lambda x : [ps.stem(y) for y in x]) # stemming
    DF["com_tok_str"] = DF["com_stem"].apply(', '.join)
    DF["com_full"] = DF["com_remv"].apply(' '.join)
    return DF

In [18]:
comments = nlpFunction(comments)
comments.head()

Unnamed: 0,label,comment,com_token,com_remv,com_lemma,com_stem,com_tok_str,com_full
0,-1.0,Everyone knows brand s papers from But No on...,"[everyone, knows, brand, s, papers, from, but,...","[everyone, knows, brand, papers, one, knows, w...","[everyone, know, brand, paper, one, know, welf...","[everyon, know, brand, paper, one, know, welfa...","everyon, know, brand, paper, one, know, welfar...",everyone knows brand papers one knows welfare ...
1,0.0,Your paper cut balance is,"[your, paper, cut, balance, is]","[paper, cut, balance]","[paper, cut, balance]","[paper, cut, balanc]","paper, cut, balanc",paper cut balance
2,1.0,OH SHIT WHEN I SAW THIS ON MY FRONT PAGE ...,"[oh, shit, when, i, saw, this, on, my, front, ...","[oh, shit, saw, front, page, love, song]","[oh, shit, saw, front, page, love, song]","[oh, shit, saw, front, page, love, song]","oh, shit, saw, front, page, love, song",oh shit saw front page love song
3,1.0,Blowing my mind yet again,"[blowing, my, mind, yet, again]","[blowing, mind, yet]","[blowing, mind, yet]","[blow, mind, yet]","blow, mind, yet",blowing mind yet
4,0.0,Should have gone with Dunder Mifflin,"[should, have, gone, with, dunder, mifflin]","[gone, dunder, mifflin]","[gone, dunder, mifflin]","[gone, dunder, mifflin]","gone, dunder, mifflin",gone dunder mifflin


In [19]:
def drop_cols_after_nlp(comments):
    comments = comments.drop(columns = ['comment', 'com_token', 'com_remv', 'com_lemma', 'com_stem', 'com_tok_str'], axis = 1)
    return comments
comments = drop_cols_after_nlp(comments)
comments.head()

Unnamed: 0,label,com_full
0,-1.0,everyone knows brand papers one knows welfare ...
1,0.0,paper cut balance
2,1.0,oh shit saw front page love song
3,1.0,blowing mind yet
4,0.0,gone dunder mifflin


In [20]:
comments.rename(columns = {'com_full': 'comment'}, inplace=True)
comments.head()

Unnamed: 0,label,comment
0,-1.0,everyone knows brand papers one knows welfare ...
1,0.0,paper cut balance
2,1.0,oh shit saw front page love song
3,1.0,blowing mind yet
4,0.0,gone dunder mifflin


In [21]:
def remove_missing_vals(comments):
    comments['comment'] = comments['comment'].str.strip()
    comments = comments[comments.comment != 'nan'] # remove nan values from data
    comments = comments[comments.comment != '']

remove_missing_vals(comments)

In [22]:
comments.head()

Unnamed: 0,label,comment
0,-1.0,everyone knows brand papers one knows welfare ...
1,0.0,paper cut balance
2,1.0,oh shit saw front page love song
3,1.0,blowing mind yet
4,0.0,gone dunder mifflin


In [23]:
comments['label'].isna().sum()

2355

In [24]:
comments = comments[comments['label'].notna()]
comments['label'].isna().sum()

0

In [25]:
len(comments)

14830

In [26]:
X = comments['comment']
y = comments.label

In [27]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=53, test_size=0.25)

In [28]:
# Initialize count vectorizer
count_vectorizer = CountVectorizer(stop_words='english',
                                   min_df=0.05, max_df=0.9)

# Create count train and test variables
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

# Initialize tfidf vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english',
                                   min_df=0.05, max_df=0.9)

# Create tfidf train and test variables
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

In [29]:
#Model Building
# Set seed for reproducibility
import random; random.seed(5)

# Import all we need from sklearn
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn import metrics

In [30]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression()
lr_model.fit(tfidf_train,y_train)
accuracy_lr = lr_model.score(tfidf_test,y_test)
print("Logistic Regression accuracy is (for Tfidf) : %0.3f" % accuracy_lr)

Logistic Regression accuracy is (for Tfidf) : 0.788


In [31]:
# Create a SVM model
from sklearn import svm
tfidf_svc = svm.SVC(kernel='linear', C=1)

tfidf_svc.fit(tfidf_train,y_train)
# Run predict on your tfidf test data to get your predictions
tfidf_svc_pred = tfidf_svc.predict(tfidf_test)

# Calculate your accuracy using the metrics module
tfidf_svc_score = metrics.accuracy_score(y_test,tfidf_svc_pred)

print("LinearSVC Score (for tfidf):   %0.3f" % tfidf_svc_score)

LinearSVC Score (for tfidf):   0.792


In [32]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier
rf_model_initial = RandomForestClassifier(n_estimators = 5, random_state = 1)
rf_model_initial.fit(tfidf_train,y_train)
print("Random Forest accuracy for 5 trees is (Tfidf): %0.3f" % rf_model_initial.score(tfidf_test,y_test))

Random Forest accuracy for 5 trees is (Tfidf): 0.798


In [33]:
#Predicting Sentiment For YouTube video
prediction_comments = pd.read_csv('/content/drive/MyDrive/Project_BCA/Dataset/Comments.csv', delimiter=",", encoding='utf-8', engine='python')
prediction_comments = prediction_comments.iloc[:,:1]
prediction_comments.columns=['comment']
prediction_comments.head()

Unnamed: 0,comment
0,What do YOU think to the current state of Fold...
1,"Well, finally someone who can compete with Sam..."
2,I wanna see them attempt something like the Z-...
3,"4:57 ""And then actually coming with the charge..."
4,"Personally, for me this was one of, if not the..."


In [34]:
# Lets use SVC to predict on our youtube video comments
prediction_comments.head()

Unnamed: 0,comment
0,What do YOU think to the current state of Fold...
1,"Well, finally someone who can compete with Sam..."
2,I wanna see them attempt something like the Z-...
3,"4:57 ""And then actually coming with the charge..."
4,"Personally, for me this was one of, if not the..."


In [35]:
len(prediction_comments['comment'])

1001

In [36]:
convert_to_string(prediction_comments)
cleanerFn(prediction_comments)
prediction_comments = nlpFunction(prediction_comments)
prediction_comments = drop_cols_after_nlp(prediction_comments)
prediction_comments.rename(columns = {'com_full': 'comment'}, inplace=True)
remove_missing_vals(prediction_comments)
prediction_comments.head()

Unnamed: 0,comment
0,think current state foldable phones check tesl...
1,well finally someone compete samsung market co...
2,wanna see attempt something like z flip someth...
3,actually coming charger respect xiaomi getting...
4,personally one best video ever made simple alw...


In [37]:
tfidf_pred = tfidf_vectorizer.transform(prediction_comments['comment'])
tfidf_svc_pred = tfidf_svc.predict(tfidf_pred)

In [38]:
neutral = (tfidf_svc_pred == 0.0).sum()
positive = (tfidf_svc_pred == 1.0).sum()
negative = (tfidf_svc_pred < 0).sum()

In [39]:
print(neutral, positive, negative)

833 161 7


In [40]:
print("Good video" if positive > negative else "Bad video")

Good video
