In [1]:
import pandas as pd
import sklearn as skl
import numpy as np
import json
from pycontractions import Contractions



# The Data

In [2]:
# NOTE(review): open() is called without a file argument, which raises
# TypeError as written -- the path to the JSON export appears to have been
# removed; supply it before running this cell.
with open() as json_data:
    # Parse the whole JSON document into a Python object.
    data = json.load(json_data)

# Build the working frame from the "messages" entry of the parsed JSON.
dataframe = pd.DataFrame(data["messages"])

## Cleaning the Data

Removing unnecessary columns, null data, and any comments that contain weblinks.

In [3]:
# Relabel the columns positionally (the list length must equal the frame's
# column count), then discard everything except the message text, the sender
# and the timestamp.
dataframe.columns = [
    'audio_files', 'content', 'files', 'gifs', 'photos', 'plan',
    'reactions', 'sender_name', 'share', 'sticker', 'epoch time', 'type',
    'videos',
]

dataframe = dataframe.drop(
    columns=[
        'audio_files', 'files', 'gifs', 'plan', 'reactions', 'share',
        'sticker', 'type', 'videos', 'photos',
    ]
)

In [4]:
# Keep only the rows that actually carry message text.
dataframe = dataframe[dataframe['content'].notnull()]

In [5]:
# Discard every message whose text contains "http" anywhere in it.
dataframe = dataframe.loc[~dataframe["content"].str.contains("http")]

# Basic NLP

SIA was used to calculate polarity for the group infographic.

In [6]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

In [7]:
sia = SIA()


def compound_polarity(text):
    """Return the 'compound' entry of the analyzer's polarity scores for ``text``."""
    return sia.polarity_scores(text)['compound']


# One compound polarity value per message.
dataframe['polarity'] = dataframe['content'].apply(compound_polarity)

#dataframe

In [8]:
# Mean compound polarity of each sender's messages.
grouped = dataframe.groupby(by="sender_name")
grouped["polarity"].mean()

sender_name
Ali Javed                  0.035944
Ali Nawed                  0.064050
Aushvin Vasanthakumaran    0.009103
Diahanna Rose              0.059919
Farhin Chowdhury           0.000000
Hamza Ahsan                0.022621
Harris Ismael              0.088019
Janusshan Paramasivam     -0.015022
Joey Bot                   0.121728
Kishan Baskaran            0.067994
Pavi Subenderan            0.035082
Rogges Anandarajah         0.041745
Saad Daas                  0.031565
Sabil Ahmed                0.118182
Thannoj Thavalingam        0.064037
Vinoth Maruthalingam       0.063329
Name: polarity, dtype: float64

In [9]:
import nltk

The next 4 cells run slowly, especially the first two. For whatever reason, turning remove_contractions from its original generator form into a list takes inordinately long (around 30-45 minutes), even though expanding a single text takes only milliseconds. This merits further investigation, but it did deliver the data: the expanded forms of contractions from both formal and slang English.

Update: On running out the generator, a massive delay occurs following the message "if you cannot play defense you should not be in discussion for top 15 even". Probably worth looking into.

Update 2: The csv file of raw data and other processing showed nothing suspicious at that point. Weird...

In [10]:
# Build the contraction expander from the local model file and load its models.
cont = Contractions('GoogleNews-vectors-negative300.bin')
cont.load_models()

# Quick check on a single sentence; str.join consumes the generator directly.
''.join(cont.expand_texts(["Who's gonna kill u"], precise=True))

'who is going to kill you'

In [16]:
# Expand contractions in every message and materialize the generator in one step.
remove_contractions = list(
    cont.expand_texts(dataframe['content'].tolist(), precise=True)
)

In [None]:
# Force every expanded message to a plain str before attaching it to the frame.
remove_contractions = list(map(str, remove_contractions))

dataframe["new_content"] = remove_contractions

dataframe

In [None]:
# Bare expression: displays the list of expanded messages as the cell output.
remove_contractions

Single use cases for tokenizing and stemming words.

In [18]:
# Tokenize the first expanded message into a list of word tokens.
nltk.word_tokenize(dataframe["new_content"].iloc[0])

['Yea', 'def', 'not', 'worth']

In [19]:
# `a` is the shared WordNet lemmatizer; lemmstop (defined further down) reuses it.
a = nltk.stem.WordNetLemmatizer()
# Single-word check of the lemmatizer.
a.lemmatize("mice")

'mouse'

The PorterStemmer doesn't work on curse words.

In [20]:
# Single-word check of the Porter stemmer.
b = nltk.PorterStemmer()
b.stem("fucker")

'fucker'

In [21]:
# Tokenize one expanded message, then lemmatize each of its tokens.
c = nltk.word_tokenize(dataframe['new_content'].iloc[1422])
list(map(a.lemmatize, c))

['But', 'today', 'like', 'come', 'on']

Stopwords to remove, so more substantial language can be examined.

In [22]:
# English stopwords from the NLTK corpus; lemmstop filters tokens against this.
stopwords = nltk.corpus.stopwords.words('english')

lemmstop combines generalized forms of all the single use cases above.

In [23]:
def lemmstop(text):
    """Return the lemmatized, lower-cased, non-stopword tokens of ``text``.

    Tokens that are not purely alphabetic are discarded, the remainder are
    lower-cased, anything found in ``stopwords`` is removed, and each
    surviving token is lemmatized with the shared lemmatizer ``a``.
    """
    lowered = (tok.lower() for tok in nltk.word_tokenize(text) if tok.isalpha())
    return [a.lemmatize(tok) for tok in lowered if tok not in stopwords]

In [39]:
# Store, for every expanded message, the token list produced by lemmstop.
dataframe['lemmatized'] = dataframe['new_content'].apply(lemmstop)

In [None]:
# Bare expression: displays the frame as the cell output.
dataframe

repeat counts words in which the same character appears three or more times in a row (i.e. is repeated at least twice). Those that occur 70 or more times will be included for further analysis, e.g. lmaoooo is treated the same as lmao.

In [None]:
import re

repeated_letters = {}


def repeat(word_list):
    """Tally words that contain a run of three or more identical characters.

    Each word in ``word_list`` is lower-cased; if it contains a character
    immediately followed by at least two more copies of itself, its count in
    the module-level ``repeated_letters`` dict is incremented. Returns None.
    """
    for raw_word in word_list:
        lowered = raw_word.lower()
        # Skip words without a 3+ run of the same character.
        if not re.search(r'((\w)\2{2,})', lowered):
            continue
        repeated_letters[lowered] = repeated_letters.get(lowered, 0) + 1

# `op` is otherwise first imported in a later cell, so on a fresh
# top-to-bottom run the sort below would raise NameError without this import.
import operator as op

# Run repeat over every token list; it updates repeated_letters in place.
dataframe['lemmatized'].apply(func = repeat)

# (word, count) pairs in ascending order of count.
sorted_letter_repeats = sorted(repeated_letters.items(), key=op.itemgetter(1))

# Display with the most frequent words first.
sorted_letter_repeats[::-1]

word_counter counts how many times each word occurs. multi_letter_exceptions handles instances of the same word being spelt with varying numbers of a repeated letter.

In [None]:
import operator as op
word_counts = {}

# Elongated spellings mapped to the canonical word they are counted as.
# Defined once here rather than being rebuilt for every word inside the loop.
multi_letter_exceptions = {'loool': 'lol', 'lmaooo': 'lmao', 'looool': "lol",
                           'lmaoooo': "lmao", 'yeahhh': 'yeah', 'ohhh': "oh",
                           'yeee': "ye"}


def word_counter(word_list):
    """Add the words of ``word_list`` to the module-level ``word_counts`` tally.

    A word that is a key of ``multi_letter_exceptions`` is counted under its
    mapped canonical spelling; every other word is counted as-is.
    Returns None; the result is the updated ``word_counts`` dict.
    """
    for word in word_list:
        word = multi_letter_exceptions.get(word, word)
        word_counts[word] = word_counts.get(word, 0) + 1

# Run word_counter over every token list; it updates word_counts in place.
dataframe['lemmatized'].apply(word_counter)

# (word, count) pairs in ascending order of count.
sorted_word_counts = sorted(word_counts.items(), key=lambda pair: pair[1])

# Display with the most frequent words first.
sorted_word_counts[::-1]

More work to follow as time permits; topic extraction is slated next. 