## importing library

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [2]:
train = pd.read_csv(r'C:\Users\User\Desktop\practice\train.csv')
test = pd.read_csv(r'C:\Users\User\Desktop\practice\test.csv')

In [3]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [4]:
test.head()

Unnamed: 0,id,tweet
0,7921,I hate the new #iphone upgrade. Won't let me d...
1,7922,currently shitting my fucking pants. #apple #i...
2,7923,"I'd like to puts some CD-ROMS on my iPad, is t..."
3,7924,My ipod is officially dead. I lost all my pict...
4,7925,Been fighting iTunes all night! I only want th...


## 1. function to remove links

In [5]:
def clean_link(text):
    clean = re.compile('(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)')
    return re.sub(clean,'',text)

In [6]:
train['tweet'] = train['tweet'].apply(clean_link)

In [7]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test #android #apps #b...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


## 2. Function to convert into lower case

In [8]:
def convert_lower(text):
    return text.lower()

In [9]:
train['tweet'] = train['tweet'].apply(convert_lower)

In [10]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #pregnancy test #android #apps #b...
1,2,0,finally a transparant silicon case ^^ thanks t...
2,3,0,we love this! would you go? #talk #makememorie...
3,4,0,i'm wired i know i'm george i was made that wa...
4,5,1,what amazing service! apple won't even talk to...



## 3. function to remove special character

In [11]:
def remove_special(text):
    x = ''
    
    for i in text:
        if i.isalnum():
            x = x+i
        else:
            x = x+' '
    return x

In [12]:
train['tweet'] = train['tweet'].apply(remove_special)

In [13]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,fingerprint pregnancy test android apps b...
1,2,0,finally a transparant silicon case thanks t...
2,3,0,we love this would you go talk makememorie...
3,4,0,i m wired i know i m george i was made that wa...
4,5,1,what amazing service apple won t even talk to...


## 4. removing stopwords

In [14]:
def remove_stopwords(text):
    x = []
    for i in text.split():
        
        if i not in stopwords.words('english'):
            x.append(i)
    y = x[:]
    x.clear()
    return y

In [15]:
train['tweet'] = train['tweet'].apply(remove_stopwords)

In [16]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,"[fingerprint, pregnancy, test, android, apps, ..."
1,2,0,"[finally, transparant, silicon, case, thanks, ..."
2,3,0,"[love, would, go, talk, makememories, unplug, ..."
3,4,0,"[wired, know, george, made, way, iphone, cute,..."
4,5,1,"[amazing, service, apple, even, talk, question..."


## 5. removing stem words

In [17]:
ps = PorterStemmer()

In [18]:
def stem_word(text):
    y = []
    for i in text:
        y.append(ps.stem(i))
    z = y[:]
    y.clear()
    return z

In [19]:
train['tweet'] = train['tweet'].apply(stem_word)

In [20]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,"[fingerprint, pregnanc, test, android, app, be..."
1,2,0,"[final, transpar, silicon, case, thank, uncl, ..."
2,3,0,"[love, would, go, talk, makememori, unplug, re..."
3,4,0,"[wire, know, georg, made, way, iphon, cute, da..."
4,5,1,"[amaz, servic, appl, even, talk, question, unl..."


## 6. joining back

In [21]:
def join_back(list_input):
    return " ".join(list_input)

In [22]:
train['tweet'] = train['tweet'].apply(join_back)

In [23]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,fingerprint pregnanc test android app beauti c...
1,2,0,final transpar silicon case thank uncl yay son...
2,3,0,love would go talk makememori unplug relax iph...
3,4,0,wire know georg made way iphon cute daventri home
4,5,1,amaz servic appl even talk question unless pay...


## Vectorizing the data

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [25]:
X = cv.fit_transform(train['tweet']).toarray()

In [26]:
X.shape

(7920, 14789)

In [27]:
y = train.iloc[:,-2].values

In [28]:
y

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [29]:
y.shape

(7920,)

## Train test split

In [30]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)

In [31]:
X_train.shape

(5544, 14789)

## Model building

In [32]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB

In [33]:
clf1 = GaussianNB()
clf2 = MultinomialNB()
clf3 = BernoulliNB()

In [34]:
clf1.fit(X_train,y_train)
clf2.fit(X_train,y_train)
clf3.fit(X_train,y_train)

In [35]:
y_pred1 = clf1.predict(X_test)
y_pred2 = clf2.predict(X_test)
y_pred3 = clf3.predict(X_test)

In [36]:
from sklearn.metrics import accuracy_score

In [37]:
print('Gaussian',accuracy_score(y_test,y_pred1))
print('Multinomial',accuracy_score(y_test,y_pred2))
print('Bernoulli',accuracy_score(y_test,y_pred3))

Gaussian 0.7988215488215489
Multinomial 0.8707912457912458
Bernoulli 0.8733164983164983


In [38]:
test['tweet'] = test['tweet'].apply(clean_link)
test['tweet'] = test['tweet'].apply(convert_lower)
test['tweet'] = test['tweet'].apply(remove_special)
test['tweet'] = test['tweet'].apply(remove_stopwords)
test['tweet'] = test['tweet'].apply(stem_word)
test['tweet'] = test['tweet'].apply(join_back)
test1 = cv.transform(test['tweet']).toarray()

In [39]:
test1.shape

(1953, 14789)

In [40]:
y_pred = clf2.predict(test1)

In [41]:
y_pred = pd.DataFrame(y_pred)

In [42]:
y_pred.shape

(1953, 1)

In [43]:
final = pd.concat([test['id'],y_pred],axis = 1)

In [44]:
final.rename(columns={0:'label'},inplace=True)

In [45]:
final.to_csv('final.csv',index = False)