In [1]:
import pandas as pd
import sklearn as skl
import numpy as np
import json
from pycontractions import Contractions



# The Data

In [2]:
with open() as json_data:
    data = json.load(json_data)

dataframe = pd.DataFrame(data["messages"])

## Cleaning the Data

Removing unnecessary column, null data, and any comments that are only weblinks.

In [3]:
dataframe.columns = ['audio_files', 'content', 'files', 'gifs', 'photos', 'plan',
       'reactions', 'sender_name', 'share', 'sticker', 'epoch time', 'type',
       'videos']

dataframe = dataframe.drop(labels = ['audio_files', 'files', 'gifs', 'plan', 'reactions', 'share',
                                     'sticker', 'type', 'videos', 'photos'], axis = 1)

In [4]:
dataframe = dataframe[~dataframe['content'].isnull()]

In [5]:
dataframe = dataframe[~dataframe["content"].str.contains("http")]

# Basic NLP

SIA was used to calculate polarity for the group infographic.

In [6]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

In [7]:
sia = SIA()

def compound_polarity(text):
    scores = sia.polarity_scores(text)
    return scores['compound']

dataframe['polarity'] = dataframe['content'].apply(compound_polarity)

#dataframe

In [8]:
grouped = dataframe.groupby("sender_name")
grouped["polarity"].mean()

sender_name
Ali Javed                  0.035944
Ali Nawed                  0.064050
Aushvin Vasanthakumaran    0.009103
Diahanna Rose              0.059919
Farhin Chowdhury           0.000000
Hamza Ahsan                0.022621
Harris Ismael              0.088019
Janusshan Paramasivam     -0.015022
Joey Bot                   0.121728
Kishan Baskaran            0.067994
Pavi Subenderan            0.035082
Rogges Anandarajah         0.041745
Saad Daas                  0.031565
Sabil Ahmed                0.118182
Thannoj Thavalingam        0.064037
Vinoth Maruthalingam       0.063329
Name: polarity, dtype: float64

In [9]:
import nltk

The next 4 cells run slow, especially the first two. For whatever reason, creating turning remove_contractions into a list from its original generator form takes inordinately long; like 30-45 minutes long despite a single object only taking miliseconds. Merits further investigation but it did deliver the data which is the expanded form of several contractions both formal and sland English.

Update: On running out the generator, a massive delay occurs following the message "if you cannot play defense you should not be in discussion for top 15 even". Probably worth looking into.

Update 2: The csv file of raw data and other processing showed nothing suspicious at that point. Weird...

In [10]:
cont = Contractions('GoogleNews-vectors-negative300.bin')
cont.load_models()

''.join(list(cont.expand_texts(["Who's gonna kill u"], precise = True)))

'who is going to kill you'

In [16]:
remove_contractions = cont.expand_texts(list(dataframe['content']), precise = True)

remove_contractions = list(remove_contractions)

In [38]:
remove_contractions = [str(text) for text in remove_contractions]

dataframe["new_content"] = remove_contractions

dataframe

Unnamed: 0,content,sender_name,epoch time,polarity,new_content
0,Yea def not worth,Rogges Anandarajah,1531451468,-0.1695,Yea def not worth
1,"Damn Kawhi Lite, White Tim Duncan, and DD",Rogges Anandarajah,1531451453,-0.4019,"Damn Kawhi Lite, White Tim Duncan, and DD"
2,Wtf,Aushvin Vasanthakumaran,1531451445,-0.5859,Wtf
3,At all,Aushvin Vasanthakumaran,1531451444,0.0000,At all
4,That IS NOT WORTH,Aushvin Vasanthakumaran,1531451442,0.3885,That IS NOT WORTH
5,SEND ME,Aushvin Vasanthakumaran,1531451438,0.0000,SEND ME
6,Rumours,Janusshan Paramasivam,1531451391,0.0000,Rumours
7,Do you see this,Aushvin Vasanthakumaran,1531451279,0.0000,Do you see this
8,Where,Aushvin Vasanthakumaran,1531451275,0.0000,Where
9,Apparently,Janusshan Paramasivam,1531451037,0.0000,Apparently


In [37]:
remove_contractions

['Yea def not worth',
 'Damn Kawhi Lite, White Tim Duncan, and DD',
 'Wtf',
 'At all',
 'That IS NOT WORTH',
 'SEND ME',
 'Rumours',
 'Do you see this',
 'Where',
 'Apparently',
 'For kawaii',
 'DeRozan og and poetl',
 'But it must have some kind of substance',
 'it is not guaranteed',
 'Those are the guys that had la as favorite for LeBron',
 'Imagine kawahi on raptors',
 'Damn',
 'Just a heads up',
 'we will see',
 'Well there is still no word on what is going on',
 'Few big cities',
 'Who have luckily gone untargeted :/',
 'it is unsubstantiated atm',
 'is not Toronto one of the few cities',
 'that is scary as shit',
 'Wtffff',
 'it will probably be nothing but if you are Dt or somewhere tourists would go watch yourself',
 'Yo apparently someone called in a threat to the police',
 'lmao top comment',
 'I thought plastic surgery has more money',
 'maybe good money idk',
 'yo tf wants to be ahgyno wtf',
 'Heard she liked me',
 'Yeah I tried to overcharge still',
 'Looool',
 'Tried imp

Single use cases for tokenizing and stemming words.

In [18]:
nltk.word_tokenize(dataframe["new_content"].iloc[0])

['Yea', 'def', 'not', 'worth']

In [19]:
a = nltk.stem.WordNetLemmatizer()
a.lemmatize("mice")

'mouse'

The PorterStemmer doesn't work on curse words.

In [20]:
b = nltk.PorterStemmer()
b.stem("fucker")

'fucker'

In [21]:
c = nltk.word_tokenize(dataframe['new_content'].iloc[1422])
[a.lemmatize(word) for word in c]

['But', 'today', 'like', 'come', 'on']

Stopwords to remove, so more substantial language can be examined.

In [22]:
stopwords = nltk.corpus.stopwords.words('english')

lemmstop combines generalized forms of all the single use cases above.

In [23]:
def lemmstop(text):
    token = nltk.word_tokenize(text)
    token = [word.lower() for word in token if word.isalpha()]
    no_stop = [word for word in token if word not in stopwords]
    lemm = [a.lemmatize(word) for word in no_stop]
    return lemm

In [39]:
dataframe['lemmatized'] = dataframe['new_content'].apply(lemmstop)

In [40]:
dataframe

Unnamed: 0,content,sender_name,epoch time,polarity,new_content,lemmatized
0,Yea def not worth,Rogges Anandarajah,1531451468,-0.1695,Yea def not worth,"[yea, def, worth]"
1,"Damn Kawhi Lite, White Tim Duncan, and DD",Rogges Anandarajah,1531451453,-0.4019,"Damn Kawhi Lite, White Tim Duncan, and DD","[damn, kawhi, lite, white, tim, duncan, dd]"
2,Wtf,Aushvin Vasanthakumaran,1531451445,-0.5859,Wtf,[wtf]
3,At all,Aushvin Vasanthakumaran,1531451444,0.0000,At all,[]
4,That IS NOT WORTH,Aushvin Vasanthakumaran,1531451442,0.3885,That IS NOT WORTH,[worth]
5,SEND ME,Aushvin Vasanthakumaran,1531451438,0.0000,SEND ME,[send]
6,Rumours,Janusshan Paramasivam,1531451391,0.0000,Rumours,[rumour]
7,Do you see this,Aushvin Vasanthakumaran,1531451279,0.0000,Do you see this,[see]
8,Where,Aushvin Vasanthakumaran,1531451275,0.0000,Where,[]
9,Apparently,Janusshan Paramasivam,1531451037,0.0000,Apparently,[apparently]


repeat counts up instances of words where the same letter is repeated two or more times. Those that occur 70 or more times will be included for further analysis. E.g. lmaoooo is the same as lmao

In [42]:
import re 

repeated_letters = {}

def repeat(word_list):
    for word in word_list:
        word = word.lower()
        if re.findall(r'((\w)\2{2,})', word):
            if word in repeated_letters:
                repeated_letters[word] += 1
            else:
                repeated_letters[word] = 1

dataframe['lemmatized'].apply(func = repeat)

sorted_letter_repeats = sorted(repeated_letters.items(), key=op.itemgetter(1))

sorted_letter_repeats[::-1]

[('loool', 423),
 ('lmaooo', 207),
 ('looool', 176),
 ('ooo', 140),
 ('lmaoooo', 83),
 ('yeahhh', 82),
 ('loooool', 80),
 ('ohhh', 79),
 ('yeee', 74),
 ('yooo', 58),
 ('oooo', 49),
 ('ayyy', 43),
 ('damnnn', 41),
 ('wtfff', 38),
 ('worddd', 36),
 ('looooool', 35),
 ('truuu', 35),
 ('hmmm', 32),
 ('wtffff', 32),
 ('brooo', 31),
 ('yeahhhh', 29),
 ('omggg', 29),
 ('yoooo', 27),
 ('okkk', 24),
 ('lmfaooo', 23),
 ('loooooool', 23),
 ('wordddd', 23),
 ('lmaooooo', 22),
 ('omgggg', 21),
 ('sooo', 21),
 ('ooooo', 19),
 ('eyyy', 19),
 ('omggggg', 19),
 ('ayeee', 18),
 ('damnnnn', 18),
 ('ohhhh', 18),
 ('looooooool', 17),
 ('soooo', 17),
 ('ggg', 17),
 ('jeeez', 15),
 ('lmfaoooo', 15),
 ('ummm', 14),
 ('gggg', 14),
 ('yeeee', 13),
 ('broooo', 13),
 ('wtfffff', 13),
 ('loooooooool', 12),
 ('loooooooooool', 11),
 ('ayyyy', 11),
 ('uhhh', 11),
 ('nooo', 11),
 ('jeeeez', 10),
 ('ggggg', 10),
 ('truuuu', 9),
 ('lmfaooooo', 9),
 ('trueee', 9),
 ('brooooo', 9),
 ('nawww', 9),
 ('holyyy', 8),
 ('sooooo

word_counter is to count up the amount of words. multi_letter_exceptions handles the instances of the same word being spelt with varying amounts of the same letter.

In [41]:
import operator as op
word_counts = {}

def word_counter(word_list):
    for word in word_list:
        multi_letter_exceptions = {'loool': 'lol', 'lmaooo': 'lmao', 'looool': "lol",
                                  'lmaoooo': "lmao", 'yeahhh': 'yeah', 'ohhh': "oh",
                                  'yeee': "ye"}
        if word in multi_letter_exceptions:
            word = multi_letter_exceptions[word]
        if word in word_counts:
            word_counts[word] += 1
        else:
            word_counts[word] = 1

dataframe['lemmatized'].apply(func = word_counter)

sorted_word_counts = sorted(word_counts.items(), key=op.itemgetter(1))

sorted_word_counts[::-1]

[('like', 4942),
 ('yo', 3564),
 ('get', 3128),
 ('lol', 2824),
 ('bro', 2655),
 ('guy', 2567),
 ('lmao', 2514),
 ('damn', 2455),
 ('one', 2427),
 ('good', 2292),
 ('got', 2196),
 ('going', 2041),
 ('know', 2039),
 ('yeah', 1878),
 ('think', 1876),
 ('shit', 1846),
 ('oh', 1812),
 ('rogges', 1711),
 ('man', 1628),
 ('ali', 1610),
 ('go', 1590),
 ('would', 1576),
 ('yea', 1534),
 ('saad', 1474),
 ('time', 1446),
 ('pavi', 1283),
 ('need', 1257),
 ('make', 1160),
 ('really', 1138),
 ('wtf', 1062),
 ('see', 1049),
 ('still', 1048),
 ('janu', 1032),
 ('much', 999),
 ('work', 996),
 ('want', 977),
 ('day', 974),
 ('look', 963),
 ('javed', 959),
 ('word', 955),
 ('game', 944),
 ('even', 928),
 ('right', 928),
 ('kishan', 927),
 ('year', 914),
 ('back', 913),
 ('fuck', 897),
 ('pretty', 896),
 ('also', 854),
 ('thing', 849),
 ('nigga', 825),
 ('well', 820),
 ('say', 817),
 ('take', 814),
 ('hamza', 791),
 ('sure', 791),
 ('av', 780),
 ('actually', 777),
 ('true', 759),
 ('omg', 750),
 ('wait'

More work to follow as time permits; topic extraction is slated next. 