## ID : 816000325
## Name: Ajay Sieunarine
## Email: ajay.sieunarine@my.uwi.edu
## Repo: https://github.com/jefroy/big-data-A4

In [1]:
# math and graph stuff
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# text analysis stuff
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import string
from nltk import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from string import punctuation

# ML stuff
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, f1_score
from sklearn import svm
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

# utility stuff
from sklearn.utils import shuffle
from itertools import chain
from sklearn.preprocessing import LabelEncoder


sep = '============================================================================================'

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\idisc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\idisc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Warm Up (20 marks) 

In [2]:
# documents/sentences: a toy corpus of five short documents built from the
# three terms Apple, Banana and Orange; each string is one document.
corpus = [ 
    'Apple Orange Orange Apple',
    'Apple Banana Apple Banana',
    'Banana Apple Banana Banana Banana Apple',
    'Banana Orange Banana Banana Orange Banana',
    'Banana Apple Banana Banana Orange Banana'
]

def corp2vec(corpus):
    '''
    Convert a corpus into a dense document-term count matrix.

    A fresh CountVectorizer is fitted on `corpus` every time this function is
    called. Each row of the result corresponds to one document of the corpus
    and each column to one term of the vocabulary the vectorizer learned;
    the cell values are raw term frequencies.

    corpus - iterable of strings, one string per document

    Returns a 2-D numpy.ndarray of shape (number of documents, number of terms).
    '''
    vectorizer = CountVectorizer()  # model
    counts = vectorizer.fit_transform(corpus)  # sparse term-frequency matrix
    # toarray() is the documented way to densify a sparse result; the `.A`
    # shorthand is not available on every SciPy sparse container.
    return counts.toarray()

# Vectorize the toy corpus once and reuse the result for both prints instead
# of fitting a second vectorizer just to show the same matrix.
warmup_counts = corp2vec(corpus)
print(type(warmup_counts))
print(warmup_counts)

<class 'numpy.ndarray'>
[[2 0 2]
 [2 2 0]
 [2 4 0]
 [0 4 2]
 [1 4 1]]


# Preprocessing and Data Organization (20 marks) 

In [3]:
# Load the speeches dataset: fields in this file are separated by '~' and the
# text is decoded as ISO-8859-1.
df = pd.read_csv('MrTrumpSpeeches.csv', sep='~', encoding="ISO-8859-1")
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,id,playlist,upload_date,title,view_count,average_rating,like_count,dislike_count,subtitles
0,-2WTNSujhjk,Donald Trump Speeches & Events,20160220,Live Stream: Donald Trump Victory Rally in Spa...,4057.0,4.259259,44.0,10.0,presidents of the United States mr. go tr...
1,-64nfy6i58w,Donald Trump Speeches & Events,20161107,LAST RALLY: Donald Trump FINAL CAMPAIGN Rally ...,47276.0,4.358025,952.0,182.0,it's now officially Tuesday November a di...
2,-7Sp31hTxkU,Donald Trump Speeches & Events,20160423,"FULL SPEECH: Donald Trump Rally in Bridgeport,...",19966.0,4.666667,220.0,20.0,you [Music] [Music] [Music] you I...
3,-byuyavcNI4,Donald Trump Speeches & Events,20160617,"Full Speech: Donald Trump Rally in Houston, Te...",15138.0,4.582491,266.0,31.0,we welcome stars and president [Music] ...
4,09BXh-AA72M,Donald Trump Speeches & Events,20161105,"Full Speech: Donald Trump Rally in Denver, Col...",8720.0,4.924731,365.0,7.0,you thank you [Music] great people Gr...


In [4]:
# Show the dtype pandas inferred for each column on load.
df.dtypes

id                 object
playlist           object
upload_date         int64
title              object
view_count        float64
average_rating    float64
like_count        float64
dislike_count     float64
subtitles          object
dtype: object

## Task 1: 
Create a new column in the dataframe called 'sentiment'. Using appropriate existing columns, populate the new column with 0's and 1's where 0 refers to a negative sentiment and 1 refers to a positive sentiment. 

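- using the `average_rating` column, find the midpoint between the lowest and the highest rating (this midpoint is the "average score" used as the threshold below)
    * ratings above or equal to this midpoint are denoted by sentiment = 1
    * ratings under this midpoint are denoted by sentiment = 0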

In [5]:
# Find the threshold for the sentiment split: the midpoint of the observed
# rating range (halfway between the lowest and the highest average_rating).
# Note this is the middle of the range, not the mean of all ratings.
# Series.max()/min() skip NaN values, whereas the builtin max()/min() give
# order-dependent results as soon as a NaN is present in the column.
max_score = df['average_rating'].max()
min_score = df['average_rating'].min()
avg_score = (max_score + min_score) / 2
print(max_score)
print(min_score)
print(avg_score)

5.0
1.34782612324
3.17391306162


In [6]:
def pruneRating(x, threshold=None):
    '''
    utility function for lambda: map a rating to a binary sentiment label

    x         - numeric rating to classify
    threshold - cut-off the rating is compared against; when omitted, the
                notebook-level `avg_score` is used (looked up at call time)

    returns 1 for good/avg rating (x >= threshold)
    returns 0 for bad rating (x < threshold)
    returns None when neither comparison holds (e.g. x is NaN)
    '''
    if threshold is None:
        threshold = avg_score
    if x >= threshold:
        return 1
    if x < threshold:
        return 0
    # NaN compares False in both directions; keep it as a missing label
    # instead of silently assigning it to one of the two classes.
    return None

# Label every speech by passing its average rating through pruneRating.
df['sentiment'] = df['average_rating'].apply(pruneRating)

# Fit a LabelEncoder on the new column to list the distinct labels it holds.
enc3 = LabelEncoder()
enc3.fit(df['sentiment'])

print(enc3.classes_)
print(sep)
print(df['sentiment'].describe())
print(sep)
df.head()

[0 1]
count    836.000000
mean       0.903110
std        0.295985
min        0.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        1.000000
Name: sentiment, dtype: float64


Unnamed: 0,id,playlist,upload_date,title,view_count,average_rating,like_count,dislike_count,subtitles,sentiment
0,-2WTNSujhjk,Donald Trump Speeches & Events,20160220,Live Stream: Donald Trump Victory Rally in Spa...,4057.0,4.259259,44.0,10.0,presidents of the United States mr. go tr...,1
1,-64nfy6i58w,Donald Trump Speeches & Events,20161107,LAST RALLY: Donald Trump FINAL CAMPAIGN Rally ...,47276.0,4.358025,952.0,182.0,it's now officially Tuesday November a di...,1
2,-7Sp31hTxkU,Donald Trump Speeches & Events,20160423,"FULL SPEECH: Donald Trump Rally in Bridgeport,...",19966.0,4.666667,220.0,20.0,you [Music] [Music] [Music] you I...,1
3,-byuyavcNI4,Donald Trump Speeches & Events,20160617,"Full Speech: Donald Trump Rally in Houston, Te...",15138.0,4.582491,266.0,31.0,we welcome stars and president [Music] ...,1
4,09BXh-AA72M,Donald Trump Speeches & Events,20161105,"Full Speech: Donald Trump Rally in Denver, Col...",8720.0,4.924731,365.0,7.0,you thank you [Music] great people Gr...,1


## Task 2
- Clean the subtitles data and store the cleaned text in a new column 'subtitle_clean'.
    * For each step of your text cleaning give a brief explanation of why you chose to perform that method on the text. 

In [7]:
# this block of code will take a little while to process since the text is being thoroughly cleaned.
def text_process(mess):
    """
    Clean a string of text and return it as one normalized string.

    Steps, in the order they are applied:
    1. Remove all punctuation characters (string.punctuation).
    2. Split on whitespace and remove English stopwords (case-insensitive).
    3. Convert all remaining words to lowercase.
    4. Convert each word to its Porter stem.
    5. Lemmatize the stemmed word with the WordNet lemmatizer.
    6. Join the cleaned tokens back into a single string.

    mess - the raw text; a non-string value (such as the NaN pandas uses for
           a missing cell) is treated as empty text

    Returns a str in which every token is followed by one space, so a
    non-empty result ends with a trailing space; returns '' when no token
    survives the cleaning.
    """
    # A missing subtitle cannot be iterated as text; treat it as empty.
    if not isinstance(mess, str):
        return ''

    # Drop every punctuation character in a single pass over the text.
    nopunc = mess.translate(str.maketrans('', '', string.punctuation))

    # Build the stopword set once per call: calling stopwords.words() inside
    # the filter would rebuild and linearly scan the list for every word.
    stop_set = set(stopwords.words('english'))
    words = [word for word in nopunc.split() if word.lower() not in stop_set]

    # Lowercase, then stem, then lemmatize each surviving word.
    wnl = WordNetLemmatizer()
    porter = PorterStemmer()
    cleaned = [wnl.lemmatize(porter.stem(word.lower())) for word in words]

    # Each token keeps its own trailing space, matching the established
    # format of the 'subtitle_clean' strings.
    return ''.join(token + ' ' for token in cleaned)

def test_text_process():
    '''
    Print the output of text_process for two hand-picked inputs so the
    cleaning steps can be checked by eye; nothing is asserted or returned.
    '''
    # Case 1: upper-case input made mostly of stopwords.
    stopword_result = text_process("HI I AM A STOPWORD")
    print('Test stopwords and lowercase:\nHI I AM A STOPWORD = ', stopword_result)

    print(sep)

    # Case 2: inflected forms of one word, separated by punctuation.
    stemming_result = text_process("fishing, fisher , fished")
    print('Test Stemming words and removing punctuation:\nfishing, fisher, fished = ', stemming_result)

def pruneSubtitle(x):
    '''
    Adapter applied to each value of the `subtitles` col.
    x - one raw subtitle string
    Returns whatever text_process produces for x.
    '''
    cleaned = text_process(x)
    return cleaned

# Uncomment to print the text_process checks before the full run:
# test_text_process()

# Clean every subtitle and store the result next to the raw text.
df['subtitle_clean'] = df['subtitles'].apply(pruneSubtitle)

df.head()

Unnamed: 0,id,playlist,upload_date,title,view_count,average_rating,like_count,dislike_count,subtitles,sentiment,subtitle_clean
0,-2WTNSujhjk,Donald Trump Speeches & Events,20160220,Live Stream: Donald Trump Victory Rally in Spa...,4057.0,4.259259,44.0,10.0,presidents of the United States mr. go tr...,1,presid unit state mr go trapp famili thank app...
1,-64nfy6i58w,Donald Trump Speeches & Events,20161107,LAST RALLY: Donald Trump FINAL CAMPAIGN Rally ...,47276.0,4.358025,952.0,182.0,it's now officially Tuesday November a di...,1,offici tuesday novemb ever think youd hear maj...
2,-7Sp31hTxkU,Donald Trump Speeches & Events,20160423,"FULL SPEECH: Donald Trump Rally in Bridgeport,...",19966.0,4.666667,220.0,20.0,you [Music] [Music] [Music] you I...,1,music music music great countri leader free un...
3,-byuyavcNI4,Donald Trump Speeches & Events,20160617,"Full Speech: Donald Trump Rally in Houston, Te...",15138.0,4.582491,266.0,31.0,we welcome stars and president [Music] ...,1,welcom star presid music place live place wow ...
4,09BXh-AA72M,Donald Trump Speeches & Events,20161105,"Full Speech: Donald Trump Rally in Denver, Col...",8720.0,4.924731,365.0,7.0,you thank you [Music] great people Gr...,1,thank music great peopl granna three day go wi...


I chose to both stem and lemmatize each word: the amount of text being processed is very large, so reducing related word forms to a common root keeps the vocabulary smaller, and this should also allow the models to get better results.

## Task 3
Use TFIDFVectorizer and CountVectorizer to encode the cleaned subtitles. 

In [20]:
# define the data to use: the cleaned subtitles as a plain Python list of
# strings, one entry per row of df
# NOTE(review): an earlier comment here suspected this conversion of being
# memory-heavy in Jupyter - confirm if the kernel crashes on this cell
clean_subs = list(df['subtitle_clean'])

print(type(clean_subs))
print(sep)
print(len(clean_subs))

<class 'list'>
836


In [21]:
# CountVectorizer: term counts for the cleaned subtitles.
# The vectorizer has to be created in this cell: the one built inside
# corp2vec() is local to that function, so on a fresh kernel no
# `tf_vectorizer` exists at notebook scope and fit_transform would raise
# NameError.
tf_vectorizer = CountVectorizer()
tf = tf_vectorizer.fit_transform(clean_subs)
print("CountVectorizer:")
print(tf)
print(sep)
# Newer scikit-learn releases expose the vocabulary through
# get_feature_names_out(); fall back to get_feature_names() on older ones.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    print(tf_vectorizer.get_feature_names_out().tolist())
else:
    print(tf_vectorizer.get_feature_names())

CountVectorizer:
  (0, 10936)	3
  (0, 14713)	2
  (0, 13247)	4
  (0, 9344)	2
  (0, 6118)	62
  (0, 14283)	1
  (0, 5266)	6
  (0, 13903)	16
  (0, 1321)	7
  (0, 9400)	6
  (0, 5069)	3
  (0, 15309)	9
  (0, 15197)	16
  (0, 1845)	2
  (0, 10463)	24
  (0, 13058)	7
  (0, 2683)	7
  (0, 13088)	5
  (0, 7894)	10
  (0, 6025)	11
  (0, 8391)	1
  (0, 2197)	1
  (0, 8037)	3
  (0, 15289)	3
  (0, 10657)	3
  :	:
  (835, 6309)	1
  (835, 5907)	1
  (835, 3899)	1
  (835, 9324)	1
  (835, 10230)	1
  (835, 15395)	1
  (835, 13096)	1
  (835, 15180)	1
  (835, 5439)	1
  (835, 12900)	1
  (835, 14088)	1
  (835, 13702)	1
  (835, 15211)	1
  (835, 12742)	1
  (835, 13147)	1
  (835, 2360)	1
  (835, 6762)	2
  (835, 2304)	1
  (835, 11163)	1
  (835, 9942)	1
  (835, 8754)	1
  (835, 14825)	1
  (835, 15368)	1
  (835, 9781)	1
  (835, 395)	1


In [22]:
# TFIDF: TF-IDF weights for the cleaned subtitles.
# The vectorizer is fitted on the cleaned subtitles only; an extra fit on the
# warm-up `corpus` would be thrown away by this fit_transform call anyway.
tfidf_vec = TfidfVectorizer()
tfidf = tfidf_vec.fit_transform(clean_subs)
print("TFIDF:")
print(tfidf)
print(sep)
# Newer scikit-learn releases expose the vocabulary through
# get_feature_names_out(); fall back to get_feature_names() on older ones.
if hasattr(tfidf_vec, 'get_feature_names_out'):
    print(tfidf_vec.get_feature_names_out().tolist())
else:
    print(tfidf_vec.get_feature_names())


TFIDF:
  (0, 8207)	0.04722651773740116
  (0, 6678)	0.011258386438159516
  (0, 9574)	0.050821268815457704
  (0, 10834)	0.01762108325458762
  (0, 7848)	0.01332106520023026
  (0, 6275)	0.03729313259661843
  (0, 9007)	0.015151622404689681
  (0, 6366)	0.015151622404689681
  (0, 6192)	0.01619430024945148
  (0, 14481)	0.00978901656479987
  (0, 15385)	0.01305180931243145
  (0, 5058)	0.02591646195611457
  (0, 15518)	0.011448386159537732
  (0, 8833)	0.01305180931243145
  (0, 15340)	0.022610785406095294
  (0, 13923)	0.13702881131714603
  (0, 7955)	0.027924268497611062
  (0, 1293)	0.036400825861485284
  (0, 3802)	0.02848543790808502
  (0, 6859)	0.016524012488709088
  (0, 9338)	0.044483721609215335
  (0, 13356)	0.013113008967516597
  (0, 3580)	0.016456783217783974
  (0, 3234)	0.02605158945691853
  (0, 8623)	0.04348794051573086
  :	:
  (835, 12227)	0.06252048456002064
  (835, 11839)	0.009689430587479897
  (835, 6159)	0.01619555051534539
  (835, 2622)	0.02555880683624221
  (835, 9578)	0.0382725906879