In [56]:
import pandas as pd
import numpy as np
import copy
from tqdm import tqdm
import pprint

# Shared pretty-printer (5-space indent) that later cells use via pp.pprint(...).
pp = pprint.PrettyPrinter(indent=5)

In [57]:
# NOTE(review): absolute, machine-specific location -- the notebook only runs
# where this exact path exists. Consider a configurable data directory.
dataset_csv_path = "/Users/prajwalkrishn/Desktop/My_Computer/project - Dsci 601/Offensive_Tweet_Detection/Dataset/MOLID.csv"

print("reading data set....")
training_data_set = pd.read_csv(dataset_csv_path)
print("Done reading....")


reading data set....
Done reading....


In [58]:
# Preview the first five rows of the raw data set.
training_data_set.head(5)

Unnamed: 0,tweet,subtask_a,subtask_b,subtask_c
0,This is Dubai's like Michael 's phone went pud...,Offensive,UNT,
1,"In fact, never was perceived to be thrown. Eve...",Offensive,TIN,IND
2,Bhosadi I am your mother's husband. Look at yo...,Offensive,TIN,IND
3,If you ask a dog? And the smoke is drawn throu...,Offensive,TIN,IND
4,Where's Ram Kadam went to talk to the BJP and ...,Offensive,TIN,GRP


In [59]:
# Row masks for the two nested label levels.
offensive_rows = training_data_set["subtask_a"] == "Offensive"
tin_rows = training_data_set["subtask_b"] == "TIN"

# One-column frames: the tweet text and the labels of each subtask.
tweets = training_data_set[["tweet"]]
level_A_labels = training_data_set[["subtask_a"]]
# subtask_b labels are kept only for rows labelled 'Offensive' in subtask_a.
level_B_labels = training_data_set.loc[offensive_rows, ["subtask_b"]]
# subtask_c labels are kept only for rows labelled 'TIN' in subtask_b.
level_C_labels = training_data_set.loc[tin_rows, ["subtask_c"]]

# Independent working copy that the cleaning cells write into.
All_Cleaned_tweets = copy.deepcopy(tweets)

In [60]:
## Data Cleaning and Pre-Processing

In [61]:
# Preview the first five raw tweets.
tweets.head(5)

Unnamed: 0,tweet
0,This is Dubai's like Michael 's phone went pud...
1,"In fact, never was perceived to be thrown. Eve..."
2,Bhosadi I am your mother's husband. Look at yo...
3,If you ask a dog? And the smoke is drawn throu...
4,Where's Ram Kadam went to talk to the BJP and ...


In [62]:
# Preview the first five subtask_a labels (one per tweet).
level_A_labels.head(5)

Unnamed: 0,subtask_a
0,Offensive
1,Offensive
2,Offensive
3,Offensive
4,Offensive


In [63]:
# Preview the first five subtask_b labels (only rows whose subtask_a is 'Offensive').
level_B_labels.head(5)

Unnamed: 0,subtask_b
0,UNT
1,TIN
2,TIN
3,TIN
4,TIN


In [64]:
# Preview the first five subtask_c labels (only rows whose subtask_b is 'TIN').
level_C_labels.head(5)

Unnamed: 0,subtask_c
1,IND
2,IND
3,IND
4,GRP
5,IND


In [65]:
# Preview the working copy of the tweets before any cleaning has been applied.
All_Cleaned_tweets.head(5)

Unnamed: 0,tweet
0,This is Dubai's like Michael 's phone went pud...
1,"In fact, never was perceived to be thrown. Eve..."
2,Bhosadi I am your mother's husband. Look at yo...
3,If you ask a dog? And the smoke is drawn throu...
4,Where's Ram Kadam went to talk to the BJP and ...


In [None]:
import re
import nltk
# Fetch the NLTK resources the pre-processing helpers rely on.
nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('stopwords')  # stop-word lists used by stopwords.words('english')
nltk.download('wordnet')    # corpus behind WordNetLemmatizer; lemmatize() raises LookupError without it
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of 'punkt' --
# confirm against the installed version.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import LancasterStemmer,WordNetLemmatizer

# Shared stemmer / lemmatizer instances used by stemming() and lemmatization().
lancaster = LancasterStemmer()
wordNet = WordNetLemmatizer()

In [None]:
def remove_webTags_UserNames_Noise(tweet):
    """Strip placeholder tags and contraction suffixes, then blank out non-letters.

    The literal fragments ``URL``, ``@USER``, ``'ve``, ``n't``, ``'s`` and
    ``'m`` are deleted (in that order); afterwards every character outside
    ``a-z`` / ``A-Z`` is replaced by a single space.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        Text consisting solely of ASCII letters and spaces.
    """
    noise_fragments = ('URL', '@USER', '\'ve', 'n\'t', '\'s', '\'m')

    stripped = tweet
    for fragment in noise_fragments:
        stripped = stripped.replace(fragment, '')

    # Anything that is not an ASCII letter becomes a space (one-for-one).
    letters_only = re.sub(r'[^a-zA-Z]', ' ', stripped)
    return letters_only

def tokenize(tweet):
    """Lower-case ``tweet`` and split it into tokens with NLTK's ``word_tokenize``."""
    return word_tokenize(tweet.lower())

# Stop-word sets keyed by language, filled on first use so the NLTK list is
# not rebuilt for every single tweet.
_STOP_WORDS_CACHE = {}


def stop_words_removal(tokens, stop=None):
    """Drop stop words, single-character tokens and whitespace-only tokens.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one tweet.
    stop : collection of str, optional
        Words to discard. When omitted, NLTK's English stop-word list is
        used; it is loaded once and reused across calls.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    if stop is None:
        if 'english' not in _STOP_WORDS_CACHE:
            _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
        stop = _STOP_WORDS_CACHE['english']

    return [
        token
        for token in tokens
        if token not in stop
        and token.replace(' ', '') != ''  # rejects tokens made only of spaces
        and len(token) > 1
    ]

def stemming(tokens):
    """Stem every token with the shared Lancaster stemmer.

    Stems of length one or zero are discarded.

    Returns
    -------
    list of str
        The retained stems, in input order.
    """
    stems = (lancaster.stem(word) for word in tokens)
    return [stem for stem in stems if len(stem) > 1]

def lemmatization(tokens):
    """Lemmatize every token with the shared WordNet lemmatizer.

    Lemmas of length one or zero are discarded.

    Returns
    -------
    list of str
        The retained lemmas, in input order.
    """
    lemmas = (wordNet.lemmatize(word) for word in tokens)
    return [lemma for lemma in lemmas if len(lemma) > 1]

In [None]:
# Stage 1: strip tags / noise from the raw text (overwrites the 'tweet' column of the working copy).
tqdm.pandas(desc = "clean...")
All_Cleaned_tweets['tweet'] = tweets['tweet'].progress_apply(remove_webTags_UserNames_Noise)

# Stage 2: turn the cleaned text into a list of lower-cased tokens.
tqdm.pandas(desc="Tokenize..")
All_Cleaned_tweets['tokens'] = All_Cleaned_tweets['tweet'].progress_apply(tokenize)

# Stages 3-5: each step rewrites the 'tokens' column in place, in this order.
for stage_label, stage_fn in (
    ("remove STOPWORDS...", stop_words_removal),
    ("Stemming...", stemming),
    ("Lemmatize...", lemmatization),
):
    tqdm.pandas(desc=stage_label)
    All_Cleaned_tweets['tokens'] = All_Cleaned_tweets['tokens'].progress_apply(stage_fn)

# Plain list of token lists, consumed by the vectorizer.
text_vector = All_Cleaned_tweets['tokens'].tolist()

In [None]:
# Inspect the cleaned text next to the new 'tokens' column.
All_Cleaned_tweets.head(5)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfid(text_vector):
    """Turn tokenized tweets into a dense TF-IDF matrix.

    Parameters
    ----------
    text_vector : list of list of str
        One token list per tweet.

    Returns
    -------
    numpy.ndarray
        Dense array produced by ``.toarray()`` on the TF-IDF matrix, with
        one row per entry of ``text_vector``.
    """
    # TfidfVectorizer expects raw strings, so glue each token list back together.
    untokenized_data = [' '.join(tweet) for tweet in tqdm(text_vector, desc="Vectorizing...")]
    # fit_transform learns the vocabulary and produces the matrix in one pass,
    # instead of analysing the corpus twice with separate fit() and transform().
    vectors = TfidfVectorizer().fit_transform(untokenized_data).toarray()
    return vectors
  
def get_vectors(vectors, labels, keyword):
    """Select the vectors whose paired label equals ``keyword``.

    Parameters
    ----------
    vectors : sequence
        Feature vectors, positionally aligned with ``labels``.
    labels : sequence
        One label per vector.
    keyword : object
        Label value to keep.

    Returns
    -------
    list or None
        Matching vectors in input order. When the two sequences differ in
        length, a message is printed and ``None`` is returned.
    """
    if len(vectors) == len(labels):
        return [vec for vec, tag in zip(vectors, labels) if tag == keyword]

    print("Unmatching sizes!")
    return None

In [None]:
# Label lists for the three subtasks.
labels_level_a = level_A_labels['subtask_a'].values.tolist()  # Subtask A Labels
labels_level_b = level_B_labels['subtask_b'].values.tolist()  # Subtask B Labels
labels_level_c = level_C_labels['subtask_c'].values.tolist()  # Subtask C Labels

# Feature vectors: level A covers every tweet; each deeper level keeps only
# the rows whose label at the level above matches the given keyword.
vectors_level_a = tfid(text_vector)  # Numerical Vectors A
vectors_level_b = get_vectors(vectors_level_a, labels_level_a, "Offensive")  # Numerical Vectors B
vectors_level_c = get_vectors(vectors_level_b, labels_level_b, "TIN")  # Numerical Vectors C

In [None]:
# Pretty-print the level-A TF-IDF matrix.
# NOTE(review): this prints the whole object; a slice such as vectors_level_a[:5] may be enough.
pp.pprint(vectors_level_a)

In [None]:
# Pretty-print every subtask C label.
# NOTE(review): this dumps the full list; consider labels_level_c[:10] to keep the output short.
pp.pprint(labels_level_c)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
import warnings
# print("splitting and fitting on level A annotations....")
# train_vectors, test_vectors, train_labels, test_labels = train_test_split(vectors_level_a[:], labels_level_a[:], train_size=0.70)
# print("split done...")

# 75/25 train/test split of the level-A data. The seed is fixed so the split,
# and therefore every number printed below, is reproducible on a re-run.
# NOTE: despite the `_b` suffix these variables hold level-A vectors and labels.
train_vectors_b, test_vectors_b, train_labels_b, test_labels_b = train_test_split(
    vectors_level_a, labels_level_a, train_size=0.75, random_state=42
)

print("fit begins...")
# Silences all warnings for the rest of the session (global side effect).
warnings.filterwarnings(action='ignore')
classifier = DecisionTreeClassifier(max_depth=800, min_samples_split=5, random_state=42)
# Only the split criterion is searched, with 3-fold cross-validation.
params = {'criterion':['gini','entropy']}
classifier = GridSearchCV(classifier, params, cv=3, n_jobs=4)
classifier.fit(train_vectors_b, train_labels_b)
# Keep the refitted best tree rather than the search wrapper.
classifier = classifier.best_estimator_
print("fit complete....")

print("calculating accuracy....")
accuracy = accuracy_score(train_labels_b, classifier.predict(train_vectors_b))
print("Training Accuracy:", accuracy)
test_predictions = classifier.predict(test_vectors_b)
accuracy = accuracy_score(test_labels_b, test_predictions)
print("Test Accuracy:", accuracy)
print("Confusion Matrix:")
print(confusion_matrix(test_labels_b, test_predictions))
print(classification_report(test_labels_b,test_predictions))

In [None]:
print("SVM model experiment begins ...")
import warnings  # used below; previously only available if an earlier cell had imported it
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV

# 70/30 train/test split of the level-A data, seeded so re-runs give the same split.
train_vectors, test_vectors, train_labels, test_labels = train_test_split(
    vectors_level_a, labels_level_a, train_size=0.70, random_state=42
)

print("fit begins...")
# Silences all warnings for the rest of the session (global side effect).
warnings.filterwarnings(action='ignore')
classifier = LinearSVC(random_state=42)
# LinearSVC has no `gamma` or `kernel` hyper-parameter (those belong to SVC),
# so only the regularisation strength C is searched.
param_grid = {'C': [0.1, 1, 10, 100, 1000]}
# Pass this cell's own grid: the previous code handed over `params`, the
# decision-tree grid left behind by the preceding cell, which LinearSVC rejects.
classifier = GridSearchCV(classifier, param_grid, cv=3, n_jobs=4)
classifier.fit(train_vectors, train_labels)
# Keep the refitted best model rather than the search wrapper.
classifier = classifier.best_estimator_
print("fit complete....")