In [1]:
import pandas as pd

In [2]:
train = pd.read_csv("/content/train.csv")
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [3]:
train["label"].value_counts()

0    20109
1     1493
Name: label, dtype: int64

In [4]:
test = pd.read_csv("/content/test.csv")
test.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


# **Preprocess**

In [5]:
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [6]:
stop_words = set(stopwords.words("english"))

In [7]:
def  cleaning_text(df, text_field):
    df[text_field] = df[text_field].str.lower()
    df[text_field] = df[text_field].apply(lambda x: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", x))
    return df

In [8]:
#Cleaning the test and training dataset
test_clean  = cleaning_text(test, "tweet")
train_clean = cleaning_text(train,"tweet")

In [10]:
train_clean.head()

Unnamed: 0,id,label,tweet
0,1,0,when a father is dysfunctional and is so sel...
1,2,0,thanks for lyft credit i cant use cause they...
2,3,0,bihday your majesty
3,4,0,model i love u take with u all the time in u...
4,5,0,factsguide society now motivation


#Resampling


In [12]:
train_clean["label"].value_counts()

0    20109
1     1493
Name: label, dtype: int64

In [13]:
from sklearn.utils import resample

In [14]:
train_major = train_clean[train_clean.label==0]
train_minor = train_clean[train_clean.label==1]
#Here we are upsampling the data
train_minor_upsampled = resample(train_minor,replace=True,n_samples=len(train_major),random_state=123)
train_upsampled = pd.concat([train_minor_upsampled, train_major])
#The upsampled data
train_upsampled['label'].value_counts()

1    20109
0    20109
Name: label, dtype: int64

# Modeling and Pipelining

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier


In [16]:
#Here we are building the pipeline of Scikit learn SGDClassifier
pipeline_SGD = Pipeline([('vect', CountVectorizer()),('tfidf',  TfidfTransformer()),('nb', SGDClassifier()),])

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_upsampled['tweet'],train_upsampled['label'],random_state = 0)

In [19]:
#Training our model
model = pipeline_SGD.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [21]:
from sklearn.metrics import accuracy_score
print("Accuracy Score:",accuracy_score(y_pred,y_test))

Accuracy Score: 0.9718547986076579


In [22]:
pd.DataFrame(data={'Actual_class':y_test,'Predicted_class':y_pred})

Unnamed: 0,Actual_class,Predicted_class
6824,0,0
5384,0,0
14528,1,1
6882,1,1
13790,0,0
...,...,...
12598,0,0
9388,0,0
4029,0,0
18635,1,1
