In [None]:
#Utility
import pandas as pd
import numpy as np

#Plotting
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS as wc_stp

#Sklearn
import sklearn.feature_extraction.text
from sklearn.feature_extraction.text import TfidfVectorizer as tvect
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import MultinomialNB
from sklearn . decomposition import PCA
##Metrics and Testing
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
##Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
#Load the labelled dataset from the repository-relative CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='datasets/final_bullying_dataset.csv')

In [None]:
#Under-sampling to rebalance the dataset: keep a fixed number of rows per class.
#A fixed seed makes the sampled subset reproducible under Restart & Run All;
#without it every run draws different rows, so downstream scores are not comparable.
SAMPLE_SEED = 12345

dfA = df.loc[df['oh_label'] == "Not Bullying"]
dfB = df.loc[df['oh_label'] == "Bullying"]

dfA = dfA.sample(n=6000, random_state=SAMPLE_SEED)
dfB = dfB.sample(n=5700, random_state=SAMPLE_SEED)

#The concatenated frame keeps the original row index of each sampled row
df = pd.concat([dfA,dfB])

In [None]:
#Exploring the data: bar chart of how many texts carry each label
fig = plt.figure(figsize=(8,6))
label_counts = df.groupby('oh_label')['Text'].count()
label_counts.plot(kind='bar', ylim=0)
plt.show()

In [None]:
#Draw one word cloud per class to visualise the most frequent terms for each label.
for class_name in ["Not Bullying","Bullying"]:
    df_wc = df.loc[df['oh_label'] == class_name]

    # str() coerces every cell, so the join also works for non-string values
    text = " ".join(str(x) for x in df_wc["Text"])

    # Generate a word cloud image
    wordcloud = WordCloud(background_color="white").generate(text)

    # Display the generated image:
    # the matplotlib way:
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(class_name)
    plt.show()
    # Removed: the dead `text = ""` initialiser and a bare `df_wc.head()`; an expression
    # statement inside a loop is discarded, so it never produced any output.

In [None]:
#TF-IDF vectorisation: represent each text as a vector of term weights.
#Settings used here: unigrams and bigrams (ngram_range=(1,2)) with min_df=.005.
tv = tvect(ngram_range=(1,2), min_df=.005)
y = df['oh_label']
X = tv.fit_transform(df['Text'])

#Hold-out split of the vectorised corpus (seeded for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [None]:
#Testing multiple models at base configuration to see which is best for the task at hand.
#Each model is scored with CV-fold cross-validated accuracy; results are collected per fold
#in `cv_df` and summarised as a box/strip plot plus the mean accuracy per model.
models = [
    MultinomialNB(),
    LogisticRegression(),
    SVC(kernel="linear"),
    KNeighborsClassifier(n_neighbors=10)
]
CV = 10
entries = []
for model in models:
    model_name = model.__class__.__name__
    #Wrap a fresh, unfitted vectoriser (same settings as `tv`) together with the model so the
    #vocabulary and IDF weights are learned inside each training fold only. Scoring the
    #pre-vectorised matrix would let the held-out fold influence the features.
    fold_pipeline = make_pipeline(tvect(**tv.get_params()), model)
    accuracies = cross_val_score(fold_pipeline, df['Text'], y, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

#The figure must exist before plotting for figsize to apply; creating it inside plt.show()
#(as before) produced an extra empty figure and left the plots at the default size.
fig, ax = plt.subplots(figsize=(25,10))
sns.boxplot(x='model_name', y='accuracy', data=cv_df, ax=ax)
sns.stripplot(x='model_name', y='accuracy', data=cv_df,
              size=8, jitter=True, edgecolor="gray", linewidth=2, ax=ax)
plt.show()
print(cv_df.groupby('model_name').accuracy.mean())

In [None]:
#Grid search (3-fold CV) over TF-IDF and SVC hyperparameters, run on the raw text so the
#vectoriser is re-fitted inside every fold.
#NOTE(review): test_size=0.77 leaves only ~23% of the rows for the search - presumably to
#keep the grid affordable; confirm this is intentional.
X = df['Text']
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
    X, y, df.index,
    test_size=0.77,
    random_state=171340,
)

pipeline = Pipeline(steps=[('tv', tvect()), ('svc', SVC())])

#Keys follow the <step name>__<parameter> convention of the pipeline above
params = {
    'tv__max_df': [0.8,0.85,0.9],
    'tv__min_df': [1,10,100],
    'tv__ngram_range': ((1, 1), (1, 2)),
    "svc__kernel": ["linear", "poly"],
    "svc__C": [1,10]
}

grid_search = GridSearchCV(estimator=pipeline, param_grid=params, cv=3, n_jobs=-1, verbose=1)

step_names = [step_name for step_name, _ in pipeline.steps]
print("Performing grid search...")
print("pipeline:", step_names)
print("parameters:")
print(params)

grid_search.fit(X_train, y_train)
print()
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
#Report only the parameters that were part of the grid, in alphabetical order
best_parameters = grid_search.best_estimator_.get_params()
for tuned_name in sorted(params):
    print("\t%s: %r" % (tuned_name, best_parameters[tuned_name]))

In [None]:
#Grid search (3-fold CV) over TF-IDF and LogisticRegression hyperparameters, run on the raw
#text so the vectoriser is re-fitted inside every fold.
#NOTE(review): test_size=0.77 leaves only ~23% of the rows for the search - presumably to
#keep the grid affordable; confirm this is intentional.
X = df['Text']
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
    X, y, df.index,
    test_size=0.77,
    random_state=171340,
)

pipeline = Pipeline(steps=[('tv', tvect()), ('lr', LogisticRegression())])

#Keys follow the <step name>__<parameter> convention of the pipeline above
params = {
    'tv__max_df': [0.7,0.75,0.8,0.85,0.9],
    'tv__min_df': [1,10,100],
    'tv__ngram_range': ((1, 1), (1, 2)),
    "lr__solver": ['lbfgs', 'liblinear'],
    "lr__C": [100, 10, 1]
}

grid_search = GridSearchCV(estimator=pipeline, param_grid=params, cv=3, n_jobs=-1, verbose=1)

step_names = [step_name for step_name, _ in pipeline.steps]
print("Performing grid search...")
print("pipeline:", step_names)
print("parameters:")
print(params)

grid_search.fit(X_train, y_train)
print()
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
#Report only the parameters that were part of the grid, in alphabetical order
best_parameters = grid_search.best_estimator_.get_params()
for tuned_name in sorted(params):
    print("\t%s: %r" % (tuned_name, best_parameters[tuned_name]))

In [None]:
#Grid search (3-fold CV) over TF-IDF and MultinomialNB hyperparameters, run on the raw text
#so the vectoriser is re-fitted inside every fold.
#NOTE(review): test_size=0.77 leaves only ~23% of the rows for the search - presumably to
#keep the grid affordable; confirm this is intentional.
X = df['Text']
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
    X, y, df.index,
    test_size=0.77,
    random_state=171340,
)

pipeline = Pipeline(steps=[('tv', tvect()), ('mnb', MultinomialNB())])

#Keys follow the <step name>__<parameter> convention of the pipeline above
params = {
    'tv__max_df': [0.7,0.75,0.8,0.85,0.9],
    'tv__min_df': [1,10,100],
    'tv__ngram_range': ((1, 1), (1, 2)),
    "mnb__alpha": [1.0, .9, .8, .7, .6]
}

grid_search = GridSearchCV(estimator=pipeline, param_grid=params, cv=3, n_jobs=-1, verbose=1)

step_names = [step_name for step_name, _ in pipeline.steps]
print("Performing grid search...")
print("pipeline:", step_names)
print("parameters:")
print(params)

grid_search.fit(X_train, y_train)
print()
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
#Report only the parameters that were part of the grid, in alphabetical order
best_parameters = grid_search.best_estimator_.get_params()
for tuned_name in sorted(params):
    print("\t%s: %r" % (tuned_name, best_parameters[tuned_name]))

In [None]:
#Creating the final TF-IDF Vectoriser & the final train/test set.
#The raw text is split first and the vectoriser is fitted on the training rows only.
#Fitting on the whole corpus before splitting lets the test rows shape the vocabulary and
#IDF weights, which makes the held-out scores optimistic.
tv = tvect(max_df=0.7,min_df=1,ngram_range=(1,2))
y = df['oh_label']
#Same seed and default test size as before, so the rows land in the same train/test partition
text_train, text_test, y_train, y_test = train_test_split(df['Text'], y, random_state = 12345)
X_train = tv.fit_transform(text_train)
X_test = tv.transform(text_test)
#Kept so `X` still names the vectorised full corpus; it now uses the train-fitted vocabulary
X = tv.transform(df['Text'])

In [None]:
#Final SVC model: linear kernel, C=1, with probability estimates switched on
model = SVC(kernel='linear', C=1, probability=True)
model.fit(X_train, y_train)

#Test-set predictions are stored in `ypred` for the metrics cell that follows
ypred = model.predict(X_test)
print(model.score(X_test, y_test))

In [None]:
print("METRICS FOR SVC")
y_true = y_test
y_pred = ypred
#One sorted label list drives both the report and the confusion matrix.
#Previously target_names came from df['oh_label'].unique() (order of first appearance) while
#classification_report lists classes in sorted order, so per-class rows could be printed
#under the wrong class name.
class_labels = np.unique(np.concatenate([np.asarray(y_true), np.asarray(y_pred)]))
print(metrics.classification_report(y_true, y_pred, labels=class_labels,
                                    target_names=[str(label) for label in class_labels]))
#Rows are the actual classes, columns the predicted classes, both in class_labels order
data = confusion_matrix(y_true, y_pred, labels=class_labels)
df_cm = pd.DataFrame(data, columns=class_labels, index=class_labels)
df_cm.index.name = 'Actual'
df_cm.columns.name = 'Predicted'
plt.figure(figsize = (10,7))
sns.set(font_scale=1.4)#for label size
sns.heatmap(df_cm, cmap="Blues", annot=True,annot_kws={"size": 16}, fmt="g")# font size
plt.show()

In [None]:
#Final Logistic Regression model: lbfgs solver with C=100
lr_model = LogisticRegression(solver='lbfgs', C=100)
lr_model.fit(X_train, y_train)

#Test-set predictions are stored in `ypred` for the metrics cell that follows
ypred = lr_model.predict(X_test)
print(lr_model.score(X_test, y_test))

In [None]:
print("METRICS FOR Logistic Regression")
y_true = y_test
y_pred = ypred
#One sorted label list drives both the report and the confusion matrix.
#Previously target_names came from df['oh_label'].unique() (order of first appearance) while
#classification_report lists classes in sorted order, so per-class rows could be printed
#under the wrong class name.
class_labels = np.unique(np.concatenate([np.asarray(y_true), np.asarray(y_pred)]))
print(metrics.classification_report(y_true, y_pred, labels=class_labels,
                                    target_names=[str(label) for label in class_labels]))
#Rows are the actual classes, columns the predicted classes, both in class_labels order
data = confusion_matrix(y_true, y_pred, labels=class_labels)
df_cm = pd.DataFrame(data, columns=class_labels, index=class_labels)
df_cm.index.name = 'Actual'
df_cm.columns.name = 'Predicted'
plt.figure(figsize = (10,7))
sns.set(font_scale=1.4)#for label size
sns.heatmap(df_cm, cmap="Blues", annot=True,annot_kws={"size": 16}, fmt="g")# font size
plt.show()

In [None]:
#Final MultinomialNB model with smoothing parameter alpha=0.6
nb_model = MultinomialNB(alpha=0.6)
nb_model.fit(X_train, y_train)

#Test-set predictions are stored in `ypred` for the metrics cell that follows
ypred = nb_model.predict(X_test)
print(nb_model.score(X_test, y_test))

In [None]:
print("METRICS FOR MultinomialNB")
y_true = y_test
y_pred = ypred
#One sorted label list drives both the report and the confusion matrix.
#Previously target_names came from df['oh_label'].unique() (order of first appearance) while
#classification_report lists classes in sorted order, so per-class rows could be printed
#under the wrong class name.
class_labels = np.unique(np.concatenate([np.asarray(y_true), np.asarray(y_pred)]))
print(metrics.classification_report(y_true, y_pred, labels=class_labels,
                                    target_names=[str(label) for label in class_labels]))
#Rows are the actual classes, columns the predicted classes, both in class_labels order
data = confusion_matrix(y_true, y_pred, labels=class_labels)
df_cm = pd.DataFrame(data, columns=class_labels, index=class_labels)
df_cm.index.name = 'Actual'
df_cm.columns.name = 'Predicted'
plt.figure(figsize = (10,7))
sns.set(font_scale=1.4)#for label size
sns.heatmap(df_cm, cmap="Blues", annot=True,annot_kws={"size": 16}, fmt="g")# font size
plt.show()

In [None]:
#Side-by-side test-set accuracy of the three final models
final_models = [
    ('SVC model score = ', model),
    ('LogisticRegression model score = ', lr_model),
    ('MultinomialNB model score = ', nb_model),
]
for score_label, fitted_model in final_models:
    print(score_label, fitted_model.score(X_test, y_test))