In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sanket\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sanket\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df = pd.read_csv('train_cleaned.csv')
test = pd.read_csv('test_cleaned.csv')
#df.drop(['id'],axis=1, inplace=True)
#pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Labels are as follows:
label '1' ---> racist/sexist tweet           
label '0' ---> not racist/sexist tweet

In [3]:
# Checking Shape of Train and Test sets:
print("Train shape: ", df.shape)
print("Test shape: ", test.shape)

Train shape:  (29530, 3)
Test shape:  (17197, 2)


# Cleaning data:

### Cleaning and removing Punctuations:

In [4]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [5]:
punctuations_list = string.punctuation
def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

df['tweet'] = df['tweet'].astype(str)
df['tweet'] = df['tweet'].apply(lambda x: cleaning_punctuations(x))

test['tweet'] = test['tweet'].astype(str)
test['tweet'] = test['tweet'].apply(lambda x: cleaning_punctuations(x))

### Removing Stopwords:

In [6]:
sw = stopwords.words('english')

df['tweet'] = df['tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in (sw)]))
test['tweet'] = test['tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in (sw)]))

### Removing Numeric numbers:

In [7]:
def cleaning_numbers(text):
    return re.sub('[0-9]+', '', text)

df['tweet'] = df['tweet'].apply(lambda text: cleaning_numbers(text))
test['tweet'] = test['tweet'].apply(lambda text: cleaning_numbers(text))

### Tokenizing Tweets:

In [8]:
tokens = (word_tokenize(i) for i in df.tweet)
df['tweet'] = df['tweet'].apply(nltk.word_tokenize)

tokens = (word_tokenize(i) for i in test.tweet)
test['tweet'] = test['tweet'].apply(nltk.word_tokenize)

### Stemming:

In [9]:
stemm = SnowballStemmer('english')

df['tweet'] = df['tweet'].apply(lambda x: [stemm.stem(y) for y in x])
test['tweet'] = test['tweet'].apply(lambda x: [stemm.stem(y) for y in x])

### Splitting data into Train and Test sets

In [10]:
# Not spliiting, Creating X_train and y_train.
# Using 100% data for training SVC model to get better training. Because from Step - 2,
# it can be concluded that SVC model with 'TF-IDF Vectorizer (1,2) - unigrams and bigrams' performs best for this dataset


X_train = df['tweet'].astype(str)    # Converting to string, because vectorizer does'nt accept list.
y_train = df['label'].astype(str)    # Converting to string, because vectorizer does'nt accept list.
X_test = test['tweet'].astype(str)   # Converting to string, because vectorizer does'nt accept list.

In [11]:
X_train.shape

(29530,)

In [12]:
y_train.shape

(29530,)

In [13]:
X_test.shape

(17197,)

# Transforming Dataset using TF-IDF Vectorizer

### Fitting the Count Vectorizer

### Fitting the TF-IDF Vectorizer

In [14]:
# Extracting features using TF-IDF (1,2) - unigrams and bigrams
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000)
vectoriser.fit(X_train)
print('No. of feature_words: ', len(vectoriser.get_feature_names()))

No. of feature_words:  186052


### Transforming the data using TF-IDF Vectorizer

In [15]:
X_train = vectoriser.transform(X_train)
X_test  = vectoriser.transform(X_test)

# SVC Model since this model performed best during experiments:

In [16]:
svc = SVC()
hyperParam = {'C': [0.1, 1, 10, 100, 1000],'gamma': [1, 0.1, 0.01, 0.001],'kernel': ['rbf','linear','poly','sigmoid']}

gsv = GridSearchCV(svc,hyperParam,cv=5,verbose=1,n_jobs=-1)  # Using Cross Validation of 5 and n_jobs=-1 for fast training by using all the processors
best_model = gsv.fit(X_train, y_train)                       # Training model with X_train and y_train
svc_pred = best_model.predict(X_test)                        # Predicting the results

Fitting 5 folds for each of 80 candidates, totalling 400 fits


In [17]:
print("Best HyperParameter: ", gsv.best_params_)

Best HyperParameter:  {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}


### Creating Submission file:

In [18]:
print(svc_pred)
print(type(svc_pred))

my_array = svc_pred
print(len(my_array))

submission = pd.DataFrame(my_array,columns = ['label'])
submission['id'] = test['id']
submission = submission[['id','label']]
submission.to_csv('submission.csv', index=False)
submission

['0' '1' '0' ... '0' '0' '0']
<class 'numpy.ndarray'>
17197


Unnamed: 0,id,label
0,31963,0
1,31964,1
2,31965,0
3,31966,0
4,31967,0
...,...,...
17192,49155,1
17193,49156,0
17194,49157,0
17195,49158,0
