In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
train_tweets=pd.read_csv('../data/Dataset 1/clean_train_tweets.csv', encoding="utf-8")
train_tweets

Unnamed: 0,id,label,tweet,length,count
0,1,0,father dysfunctional selfish drags kids dysfun...,55,7
1,2,0,thanks lyft credit use cause offer wheelchair ...,77,11
2,3,0,bihday majesty,14,2
3,4,0,model love u take u time ur,27,7
4,5,0,factsguide society motivation,29,3
...,...,...,...,...,...
31925,31958,0,ate isz youuu,13,3
31926,31959,0,see nina turner airwaves trying wrap mantle ge...,93,14
31927,31960,0,listening sad songs monday morning otw work sad,47,8
31928,31961,1,sikh temple vandalised calgary wso condemns act,47,7


### CountVectorizer

In [3]:
#using scikit-learn to transform text into token count vector

from sklearn.feature_extraction.text import CountVectorizer

count_vector = CountVectorizer(
    token_pattern = r"[a-z]+", 
    ngram_range = (1,1), #ngram_range (1,1)= only unigrams, (1,2)=unigrams and bigrams, (2,2)=bigrams
    lowercase = True,
    min_df = 1, #min_df=1 is the default, means ignore terms that appear in less than 1 document/text.
    max_df = 1.0 #max_df=1.0 is the default, means ignore terms that appear in more than 100% of the documents/texts.
)

### Splitting the train dataset into train and development

In [4]:
#to test perfomarnce against the development set, we can split the training dataset into train and dev

from sklearn.model_selection import train_test_split

In [5]:
#15% of train_tweets will be in dev
train, dev= train_test_split(train_tweets, test_size=0.15, random_state=42)

In [6]:
X_train = train['tweet'].values
X_train_vect = count_vector.fit_transform(X_train) #fitting CountVectorizer, transforms trainging data into 
                                                    #matrix representing token counts 
X_train_vect

<27140x34039 sparse matrix of type '<class 'numpy.int64'>'
	with 200746 stored elements in Compressed Sparse Row format>

In [7]:
#setting up a PredefinedSplit

X_train = train['tweet'].values
y_train = train['label'].values

X_dev = dev['tweet'].values
y_dev = dev['label'].values

X = np.hstack([X_train, X_dev])
y = np.hstack([y_train, y_dev])

In [8]:
#assign 0 to items that are in dev and -1 for the rest
split_train_dev= np.zeros(shape=y.shape)
split_train_dev[:y_train.shape[0]] = -1
pd.value_counts(split_train_dev)

-1.0    27140
 0.0     4790
dtype: int64

### Random Forest

In [9]:
from sklearn.model_selection import PredefinedSplit,GridSearchCV

ps = PredefinedSplit(split_train_dev)

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vect_1 = CountVectorizer(
    token_pattern = r"[a-z]+", 
    ngram_range = (1,1),
    lowercase = True,
    min_df = 1,
    max_df = 1.0
)

vect_2 = TfidfVectorizer(
    token_pattern = r"[a-z]+", 
    ngram_range = (1,1),
    lowercase = True,
    min_df = 1,
    max_df = 1.0
)

select = SelectPercentile(score_func=chi2)

clf = RandomForestClassifier(n_jobs=2)

pipe = Pipeline([("vect", vect_1), ("select", select), ("clf", clf)])

In [11]:
param_grid = {
    'vect':[vect_1, vect_2],
    'vect__ngram_range':[(1,1), (1,2), (1,3)],
    'vect__min_df':[1, 2, 5, 10, 20],
    'select__percentile':[1, 2, 5, 10, 20, 50],
    'clf__n_estimators':[10, 20, 50, 100],
    'clf__max_depth':[1, 2, 5, 10],
    'clf__class_weight':[None, 'balanced', 'balanced_subsample']
}


rs = RandomizedSearchCV(pipe, param_grid, n_iter=30, scoring='f1', n_jobs=3, cv=ps, verbose=2)
rs.fit(X, y)
print(rs.best_params_)
print(rs.best_score_)

Fitting 1 folds for each of 30 candidates, totalling 30 fits
{'vect__ngram_range': (1, 1), 'vect__min_df': 10, 'vect': TfidfVectorizer(min_df=10, token_pattern='[a-z]+'), 'select__percentile': 50, 'clf__n_estimators': 100, 'clf__max_depth': 2, 'clf__class_weight': 'balanced'}
0.5121602288984264


In [12]:
from sklearn.metrics import confusion_matrix, classification_report

y_pred = rs.best_estimator_.predict(X_dev)
confusion_matrix(y_dev, y_pred)

#confusion matrix for random forest
                                     #TP 4251 -- FP 176  #positives are non_offensive
                                    # FN 182  -- TN 181  #negatives are offensive
  

array([[4251,  176],
       [ 182,  181]])

In [13]:
print(classification_report(y_dev, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      4427
           1       0.51      0.50      0.50       363

    accuracy                           0.93      4790
   macro avg       0.73      0.73      0.73      4790
weighted avg       0.92      0.93      0.92      4790



In [14]:
np.mean(y_dev==y_pred)

0.9252609603340293