In [1]:
#Algorithm that uses NLP and ML to determine if a comment made on social media (specifically reddit)
#Data was gathered from Kaggle's Reddit May 2015 hosted data off of a sql server
#Required packages: jupyter notebook, numpy, pandas, sklearn
#If no jupyter, run redditBullies.py in terminal.
import pandas as pd
import numpy as np

controversiality = pd.read_csv('controversiality.csv', error_bad_lines = False)
noncontroversiality = pd.read_csv('non-controversiality.csv', error_bad_lines = False)
#combine the controversial and noncontroversial data
data = pd.concat((controversiality, noncontroversiality), axis=0, ignore_index=True)

In [2]:
simple = pd.concat([data['controversiality'], data['body']], axis=1, keys=['author', 'body'])
#drop the null data
simple[pd.isnull(simple).any(axis=1)]
simple = simple.drop(simple.index[[7572,97008]])
simple.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99998 entries, 0 to 99999
Data columns (total 2 columns):
author    99998 non-null int64
body      99998 non-null object
dtypes: int64(1), object(1)
memory usage: 2.3+ MB


In [3]:
#eliminate non-alpha characters
simple['body'].replace(regex=True,inplace=True,to_replace=r'([^\s\w]|_)+',value=r'')
simple['body'].replace(regex=True,inplace=True,to_replace=r'/s|\n',value=r'')
print(simple.head(10))

   author                                               body
0       1  Because we arent responsible for the actions o...
1       1  I honestly wouldnt have believed it if I didnt...
2       1  There is also many intelligence service player...
3       1  The implications of that varies between cultur...
4       1  I am a bot whose sole purpose is to improve th...
5       1  Youre just trying to get to the front page  I ...
6       1  For those about to lynch this guy herehttpnere...
7       1  True though you can find papers about early ch...
8       1  Yes After days of dealing with car accidents a...
9       1  Icefrog went fucking mad I have literally no i...


In [4]:
#Create test and training set
from sklearn import cross_validation
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(simple.body, 
                                                                                                 simple.author, 
                                                                                                 test_size=0.2, 
                                                                                                 random_state=42)



In [5]:
#create vectorizer for feature selection
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_df = 0.1,
                               stop_words='english')

In [6]:
#transform the reddit comments into tuples of words and their frequency of occurence
features_train = vectorizer.fit_transform(features_train)
features_test  = vectorizer.transform(features_test).toarray()

In [7]:
#use only the top 100 features
features_train = features_train[:100].toarray()
labels_train   = labels_train[:100]

In [8]:
#Differente machine learning models used
from sklearn.tree import DecisionTreeClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score
# from sklearn.ensemble import RandomForestClassifier
# from xgboost.sklearn import XGBClassifier

In [10]:
#Run model, get accuracy
clf = DecisionTreeClassifier()
# clf = LogisticRegression()
# clf = RandomForestClassifier()
# clf = xgb.fit(features_train, labels_train)
# clf = XGBClassifier(max_depth=6,
#                     learning_rate=0.1,
#                     )
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
print("Accuracy:", accuracy_score(labels_test, pred)) #Beat random guessing!!!

Accuracy: 0.5338


In [11]:
#rank most important words that determine if a comment is controversial or not
importances = clf.feature_importances_
import numpy as np
indices = np.argsort(importances)[::-1]
print('Feature Ranking: ')
for i in range(10):
    print("{} feature no.{} ({}) {}".format(i+1,indices[i],
                                            importances[indices[i]], 
                                            vectorizer.get_feature_names()[indices[i]]))

Feature Ranking: 
1 feature no.65095 (0.0783410138248849) people
2 feature no.6157 (0.04249097066873295) allowed
3 feature no.59520 (0.04016269830332278) nfl
4 feature no.76674 (0.03987802017358663) say
5 feature no.74624 (0.039231378826040254) rmensrights
6 feature no.72031 (0.03802068772714565) reddit
7 feature no.95182 (0.03772247964042327) wallets
8 feature no.36622 (0.037305244678516467) good
9 feature no.13052 (0.03629898984267144) booooooo
10 feature no.12095 (0.03604558706599522) bit
