In [1]:
# Importing libraries:

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sanket\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_csv('train_cleaned.csv')
#df.drop(['id'],axis=1, inplace=True)
#pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None) # tto display full length text of column.
df

Unnamed: 0,id,label,tweet
0,1,0,when a father is dysfunctional and is so selfi...
1,2,0,thanks for lyft credit i can't use cause they ...
2,3,0,bihday your majesty
3,5,0,factsguide society now motivation
4,6,0,[2 2] huge fan fare and big talking before the...
...,...,...,...
29525,31958,0,ate isz that youuu?
29526,31959,0,to see nina turner on the airwaves trying to w...
29527,31960,0,listening to sad songs on a monday morning otw...
29528,31961,1,sikh temple vandalised in in calgary wso conde...


# Labels are as follows:
label '1' ---> racist/sexist tweet           
label '0' ---> not racist/sexist tweet

In [16]:
print("Dataset shape: ", df.shape)
df['label'].value_counts()

Dataset shape:  (29530, 3)


0    27517
1     2013
Name: label, dtype: int64

# Cleaning data:

### Cleaning and removing Punctuations:

In [5]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [6]:
df['tweet'] = df['tweet'].astype(str)
punctuations_list = string.punctuation
def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

df['tweet'] = df['tweet'].apply(lambda x: cleaning_punctuations(x))

### Removing Stopwords:

In [7]:
sw = stopwords.words('english')
df['tweet'] = df['tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in (sw)]))

### Removing Numeric numbers:

In [8]:
def cleaning_numbers(text):
    return re.sub('[0-9]+', '', text)

df['tweet'] = df['tweet'].apply(lambda text: cleaning_numbers(text))

### Tokenizing Tweets:

In [9]:
tokens = (word_tokenize(i) for i in df.tweet)
df['tweet'] = df['tweet'].apply(nltk.word_tokenize)
df.head()

Unnamed: 0,id,label,tweet
0,1,0,"[father, dysfunctional, selfish, drags, kids, ..."
1,2,0,"[thanks, lyft, credit, cant, use, cause, dont,..."
2,3,0,"[bihday, majesty]"
3,5,0,"[factsguide, society, motivation]"
4,6,0,"[huge, fan, fare, big, talking, leav, chaos, p..."


### Stemming:

In [10]:
stemm = SnowballStemmer('english')
df['tweet'] = df['tweet'].apply(lambda x: [stemm.stem(y) for y in x])

### Splitting data into Train and Test sets

In [11]:
X = df['tweet'].astype(str)  # Converting to string, because vectorizer does'nt accept list.
y = df['label'].astype(str)  # Converting to string, because vectorizer does'nt accept list.
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size = 0.2, random_state = 3)

# Transforming Dataset using TF-IDF Vectorizer

### Fitting the Count Vectorizer

### Fitting the TF-IDF Vectorizer

In [12]:
# Extracting features using TF-IDF (1,2) - unigrams and bigrams
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000)
vectoriser.fit(X_train)
print('No. of feature_words: ', len(vectoriser.get_feature_names()))

No. of feature_words:  154710


### Transforming the data using TF-IDF Vectorizer

In [13]:
X_train = vectoriser.transform(X_train)
X_test  = vectoriser.transform(X_test)

In [14]:
# making a dictionary with four models with some parameters:
model_params = {
    
    'SVC' :{
        'model' : SVC(),
        'params' : {
            'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['rbf','linear','poly','sigmoid']
        }
    },

    'MultinomialNB' :{
        'model' : MultinomialNB(),
        'params' : {
            'alpha' : np.linspace(0.5, 1.5, 6), 'fit_prior' : [True, False]
        }
    },
    
    'logistics_regression' :{
        'model' : LogisticRegression(solver = 'lbfgs', multi_class = 'auto'),
        'params' : {
            'C' : [0.1, 1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100], 'solver' : ['lbfgs', 'liblinear']
        }
    },
    
    'random_forest' :{
        'model' : RandomForestClassifier(),
        'params' : {
            'n_estimators' : [1,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100,200,300,400], 
            'max_depth':[20,30,None], 'criterion':['gini','entropy']
        }
    }
}

In [15]:
# implemented GridSearchCV for four models using a loop and a previously created dictionary
# in the created variable 'scores', results are stored for each model such as: model, best_score and best_params.

scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, n_jobs=-1, verbose=1) # Using Cross Validation of 5 and n_jobs=-1 for fast training by using all the processors
    print(mp['model'])
    print('\nFitting...')
    best_model = clf.fit(X_train, y_train)                      # Training the model
    clf_pred = best_model.predict(X_test)                       # Predicting the results
    print(confusion_matrix(y_test,clf_pred))                    # Printing Confusion Matrix
    print(metrics.classification_report(y_test, clf_pred))      # Printing Classification Report
    scores.append({                                             # Appending results to 'scores' list
        'model' : model_name,
        'best_score' : best_model.score(X_test, y_test),
        'best_params' : clf.best_params_
    })
    print('\nScore is appended.\n')
    
# Creating data frame with model, best scores and best params:
res = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
res

SVC()

Fitting...
Fitting 5 folds for each of 80 candidates, totalling 400 fits
[[5377  113]
 [ 142  274]]
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      5490
           1       0.71      0.66      0.68       416

    accuracy                           0.96      5906
   macro avg       0.84      0.82      0.83      5906
weighted avg       0.96      0.96      0.96      5906


Score is appended.

MultinomialNB()

Fitting...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[[5490    0]
 [ 367   49]]
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      5490
           1       1.00      0.12      0.21       416

    accuracy                           0.94      5906
   macro avg       0.97      0.56      0.59      5906
weighted avg       0.94      0.94      0.91      5906


Score is appended.

LogisticRegression()

Fitting...
Fitting 5 folds for each of 24 candidates, total

Unnamed: 0,model,best_score,best_params
0,SVC,0.956824,"{'C': 10, 'gamma': 1, 'kernel': 'sigmoid'}"
1,MultinomialNB,0.93786,"{'alpha': 0.5, 'fit_prior': True}"
2,logistics_regression,0.960041,"{'C': 100, 'solver': 'liblinear'}"
3,random_forest,0.95513,"{'criterion': 'gini', 'max_depth': None, 'n_es..."


# Conclusion:
From above result, it can be concluded that **SVC has better precision, recall and F1-score for 'Toxic' label**. So, I'm going to train **SVC model** with 100% train data, so that it get more better training and will use test set to predict & label them and will use that file for submission. Creating a new notebook for this.

Note: I have also used **'Count Vectorizer -> ngram_range=(1,1): unigrams and ngram_range=(1,2): unigram - bigrams'**, that gave very close result to **'TF-IDF Vectorizer -> ngram_range=(1,1): unigrams and ngram_range=(1,2): unigram - bigrams'**, however, **'TF-IDF with ngram_range=(1,2): unigram - bigrams'** turns out to be little bit better than **'Count Vectorizer -> ngram_range=(1,1): unigrams and ngram_range=(1,2): unigram - bigrams'** and **'TF-IDF ngram_range=(1,1): unigram'**.