In [68]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


%matplotlib inline

In [69]:
train = pd.read_csv('train.csv')

In [70]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [71]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
id       31962 non-null int64
label    31962 non-null int64
tweet    31962 non-null object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


In [72]:
train.label.value_counts()

0    29720
1     2242
Name: label, dtype: int64

# Data Cleaning

In [73]:
import re
def process_tweet(tweet):
    return " ".join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])", " ",tweet.lower()).split())

In [74]:
train['ctweets']=train['tweet'].apply(process_tweet)

In [75]:
train.head()

Unnamed: 0,id,label,tweet,ctweets
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit i can t use cause they ...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in ur
4,5,0,factsguide: society now #motivation,factsguide society now motivation


In [76]:
def drop_features(features,data):
    data.drop(features,inplace=True,axis=1)



drop_features(['id','tweet'],train)

In [77]:
train.head()

Unnamed: 0,label,ctweets
0,0,when a father is dysfunctional and is so selfi...
1,0,thanks for lyft credit i can t use cause they ...
2,0,bihday your majesty
3,0,model i love u take with u all the time in ur
4,0,factsguide society now motivation


In [78]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(train["ctweets"],train["label"],test_size=0.2,random_state=42)

In [79]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

In [80]:
count_vect = CountVectorizer(stop_words='english')
transformer =TfidfTransformer(norm='l2',sublinear_tf=True)

In [81]:
x_train_counts=count_vect.fit_transform(x_train)
x_train_tfidf = transformer.fit_transform(x_train_counts)


In [82]:
x_test_counts = count_vect.transform(x_test)
x_test_tfidf = transformer.transform(x_test_counts)

In [83]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=200)
model.fit(x_train_tfidf,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [84]:
predictions = model.predict(x_test_tfidf)

In [85]:
from sklearn.metrics import confusion_matrix,f1_score
confusion_matrix(y_test,predictions)

array([[5904,   33],
       [ 216,  240]], dtype=int64)

In [86]:
f1_score(y_test,predictions)

0.6584362139917695

# Preparing Solution

In [87]:
test = pd.read_csv('test.csv')

In [88]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17197 entries, 0 to 17196
Data columns (total 2 columns):
id       17197 non-null int64
tweet    17197 non-null object
dtypes: int64(1), object(1)
memory usage: 268.8+ KB


In [89]:
test['ctweet']=test['tweet'].apply(process_tweet)

In [90]:
test.head()

Unnamed: 0,id,tweet,ctweet
0,31963,#studiolife #aislife #requires #passion #dedic...,studiolife aislife requires passion dedication...
1,31964,@user #white #supremacists want everyone to s...,white supremacists want everyone to see the ne...
2,31965,safe ways to heal your #acne!! #altwaystohe...,safe ways to heal your acne altwaystoheal heal...
3,31966,is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",3rd bihday to my amazing hilarious nephew eli ...


In [91]:
drop_features(['tweet'],test)

In [92]:
train_counts = count_vect.fit_transform(train['ctweets'])
test_counts = count_vect.transform(test['ctweet'])

In [93]:
train_tfidf = transformer.fit_transform(train_counts)
test_tfidf = transformer.transform(test_counts)

In [94]:
model.fit(train_tfidf,train['label'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [95]:
predictions = model.predict(test_tfidf)

In [96]:
final_result = pd.DataFrame({'id':test['id'],'label':predictions})
final_result.to_csv('output.csv',index=False)