In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
#loading train.csv
df = pd.read_csv('train.csv')

In [3]:
#removing the urls from df
df['tweet'] = df['tweet'].str.replace('http\S+','')

In [4]:
#adding total number of words as features
df['word_count'] = df['tweet'].apply(lambda x: len(str(x).split(" ")))
df[['tweet','word_count']].head()

Unnamed: 0,tweet,word_count
0,#fingerprint #Pregnancy Test #android #apps #...,13
1,Finally a transparant silicon case ^^ Thanks t...,17
2,We love this! Would you go? #talk #makememorie...,15
3,I'm wired I know I'm George I was made that wa...,17
4,What amazing service! Apple won't even talk to...,23


In [5]:
#adding total number of characters as a new feature column 
df['char_count'] = df['tweet'].str.len() ## this also includes spaces
df[['tweet','char_count']].head()

Unnamed: 0,tweet,char_count
0,#fingerprint #Pregnancy Test #android #apps #...,107
1,Finally a transparant silicon case ^^ Thanks t...,97
2,We love this! Would you go? #talk #makememorie...,101
3,I'm wired I know I'm George I was made that wa...,81
4,What amazing service! Apple won't even talk to...,124


In [6]:
#adding average number of words as feature column
def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/len(words))

df['avg_word'] = df['tweet'].apply(lambda x: avg_word(x))
df[['tweet','avg_word']].head()

Unnamed: 0,tweet,avg_word
0,#fingerprint #Pregnancy Test #android #apps #...,7.916667
1,Finally a transparant silicon case ^^ Thanks t...,5.0625
2,We love this! Would you go? #talk #makememorie...,6.214286
3,I'm wired I know I'm George I was made that wa...,4.0625
4,What amazing service! Apple won't even talk to...,4.434783


In [7]:
#adding total number of stop words as features
from nltk.corpus import stopwords
stop = stopwords.words('english')

df['stopwords'] = df['tweet'].apply(lambda x: len([x for x in x.split() if x in stop]))
df[['tweet','stopwords']].head()

Unnamed: 0,tweet,stopwords
0,#fingerprint #Pregnancy Test #android #apps #...,0
1,Finally a transparant silicon case ^^ Thanks t...,3
2,We love this! Would you go? #talk #makememorie...,1
3,I'm wired I know I'm George I was made that wa...,2
4,What amazing service! Apple won't even talk to...,9


In [8]:
#adding total number of words starting with hash 
df['hastags'] = df['tweet'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
df[['tweet','hastags']].head()

Unnamed: 0,tweet,hastags
0,#fingerprint #Pregnancy Test #android #apps #...,11
1,Finally a transparant silicon case ^^ Thanks t...,5
2,We love this! Would you go? #talk #makememorie...,8
3,I'm wired I know I'm George I was made that wa...,4
4,What amazing service! Apple won't even talk to...,0


In [9]:
#adding total number of numeric characters as features
df['numerics'] = df['tweet'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
df[['tweet','numerics']].head()

Unnamed: 0,tweet,numerics
0,#fingerprint #Pregnancy Test #android #apps #...,0
1,Finally a transparant silicon case ^^ Thanks t...,0
2,We love this! Would you go? #talk #makememorie...,0
3,I'm wired I know I'm George I was made that wa...,0
4,What amazing service! Apple won't even talk to...,0


In [10]:
#adding total number of capslock words as features
df['upper'] = df['tweet'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
df[['tweet','upper']].head()

Unnamed: 0,tweet,upper
0,#fingerprint #Pregnancy Test #android #apps #...,0
1,Finally a transparant silicon case ^^ Thanks t...,1
2,We love this! Would you go? #talk #makememorie...,0
3,I'm wired I know I'm George I was made that wa...,2
4,What amazing service! Apple won't even talk to...,2


In [11]:
#converting capslock characters to lower letters
df['tweet'] = df['tweet'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df['tweet'].head()

0    #fingerprint #pregnancy test #android #apps #b...
1    finally a transparant silicon case ^^ thanks t...
2    we love this! would you go? #talk #makememorie...
3    i'm wired i know i'm george i was made that wa...
4    what amazing service! apple won't even talk to...
Name: tweet, dtype: object

In [12]:
#removing special characters
df['tweet'] = df['tweet'].str.replace('[^\w\s]','')
df['tweet'].head()

0    fingerprint pregnancy test android apps beauti...
1    finally a transparant silicon case  thanks to ...
2    we love this would you go talk makememories un...
3    im wired i know im george i was made that way ...
4    what amazing service apple wont even talk to m...
Name: tweet, dtype: object

In [13]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
df['tweet'] = df['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df['tweet'].head()

0    fingerprint pregnancy test android apps beauti...
1    finally transparant silicon case thanks uncle ...
2    love would go talk makememories unplug relax i...
3    im wired know im george made way iphone cute d...
4    amazing service apple wont even talk question ...
Name: tweet, dtype: object

In [14]:
freq = pd.Series(' '.join(df['tweet']).split()).value_counts()[:10]
freq

iphone     3757
apple      2854
samsung    1406
new        1141
phone       955
sony        849
follow      723
ipad        510
like        429
love        426
dtype: int64

In [15]:
#removing first 10 highest occuring words
freq = list(freq.index)
df['tweet'] = df['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
df['tweet'].head()

0    fingerprint pregnancy test android apps beauti...
1    finally transparant silicon case thanks uncle ...
2    would go talk makememories unplug relax smartp...
3    im wired know im george made way cute daventry...
4    amazing service wont even talk question unless...
Name: tweet, dtype: object

In [16]:
#finding out lowest ten occuring words
freq = pd.Series(' '.join(df['tweet']).split()).value_counts()[-10:]
freq

tecnologies      1
poems            1
imaginary        1
sues             1
googlemobile     1
skiers           1
sight            1
livewallpaper    1
wining           1
floodalert       1
dtype: int64

In [17]:
#removing lowest 10 occuring words
freq = list(freq.index)
df['tweet'] = df['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
df['tweet'].head()

0    fingerprint pregnancy test android apps beauti...
1    finally transparant silicon case thanks uncle ...
2    would go talk makememories unplug relax smartp...
3    im wired know im george made way cute daventry...
4    amazing service wont even talk question unless...
Name: tweet, dtype: object

In [18]:
#lemmatizing words in their base forms
from textblob import Word
df['tweet'] = df['tweet'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
df['tweet'].head()

0    fingerprint pregnancy test android apps beauti...
1    finally transparant silicon case thanks uncle ...
2    would go talk makememories unplug relax smartp...
3    im wired know im george made way cute daventry...
4    amazing service wont even talk question unless...
Name: tweet, dtype: object

In [19]:
#loading the test file
ds = pd.read_csv('test.csv')

In [20]:
#removing URLs from ds
ds['tweet'] = ds['tweet'].str.replace('http\S+','')

In [21]:
#adding number of words as features
ds['word_count'] = ds['tweet'].apply(lambda x: len(str(x).split(" ")))
ds[['tweet','word_count']].head()

Unnamed: 0,tweet,word_count
0,I hate the new #iphone upgrade. Won't let me d...,14
1,currently shitting my fucking pants. #apple #i...,11
2,"I'd like to puts some CD-ROMS on my iPad, is t...",20
3,My ipod is officially dead. I lost all my pict...,23
4,Been fighting iTunes all night! I only want th...,14


In [22]:
#adding total numbe of characters as features
ds['char_count'] = ds['tweet'].str.len() ## this also includes spaces
ds[['tweet','char_count']].head()

Unnamed: 0,tweet,char_count
0,I hate the new #iphone upgrade. Won't let me d...,77
1,currently shitting my fucking pants. #apple #i...,84
2,"I'd like to puts some CD-ROMS on my iPad, is t...",104
3,My ipod is officially dead. I lost all my pict...,129
4,Been fighting iTunes all night! I only want th...,70


In [23]:
#adding average number of words as features
def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/len(words))

ds['avg_word'] = ds['tweet'].apply(lambda x: avg_word(x))
ds[['tweet','avg_word']].head()

Unnamed: 0,tweet,avg_word
0,I hate the new #iphone upgrade. Won't let me d...,4.571429
1,currently shitting my fucking pants. #apple #i...,7.4
2,"I'd like to puts some CD-ROMS on my iPad, is t...",4.2
3,My ipod is officially dead. I lost all my pict...,4.652174
4,Been fighting iTunes all night! I only want th...,4.071429


In [24]:
#finding out total number of stopwords in a sentence
from nltk.corpus import stopwords
stop = stopwords.words('english')

ds['stopwords'] = ds['tweet'].apply(lambda x: len([x for x in x.split() if x in stop]))
ds[['tweet','stopwords']].head()

Unnamed: 0,tweet,stopwords
0,I hate the new #iphone upgrade. Won't let me d...,2
1,currently shitting my fucking pants. #apple #i...,1
2,"I'd like to puts some CD-ROMS on my iPad, is t...",10
3,My ipod is officially dead. I lost all my pict...,8
4,Been fighting iTunes all night! I only want th...,4


In [25]:
#adding number of words staring with hashtags as features
ds['hastags'] = ds['tweet'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
ds[['tweet','hastags']].head()

Unnamed: 0,tweet,hastags
0,I hate the new #iphone upgrade. Won't let me d...,3
1,currently shitting my fucking pants. #apple #i...,5
2,"I'd like to puts some CD-ROMS on my iPad, is t...",0
3,My ipod is officially dead. I lost all my pict...,2
4,Been fighting iTunes all night! I only want th...,0


In [26]:
#adding number of numeric numbers as features
ds['numerics'] = ds['tweet'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
ds[['tweet','numerics']].head()

Unnamed: 0,tweet,numerics
0,I hate the new #iphone upgrade. Won't let me d...,0
1,currently shitting my fucking pants. #apple #i...,0
2,"I'd like to puts some CD-ROMS on my iPad, is t...",0
3,My ipod is officially dead. I lost all my pict...,0
4,Been fighting iTunes all night! I only want th...,0


In [27]:
#adding number of of capslock words as features
ds['upper'] = ds['tweet'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
ds[['tweet','upper']].head()

Unnamed: 0,tweet,upper
0,I hate the new #iphone upgrade. Won't let me d...,1
1,currently shitting my fucking pants. #apple #i...,0
2,"I'd like to puts some CD-ROMS on my iPad, is t...",1
3,My ipod is officially dead. I lost all my pict...,2
4,Been fighting iTunes all night! I only want th...,2


In [28]:
#converting all letters as lowerclass letters
ds['tweet'] = ds['tweet'].apply(lambda x: " ".join(x.lower() for x in x.split()))
ds['tweet'].head()

0    i hate the new #iphone upgrade. won't let me d...
1    currently shitting my fucking pants. #apple #i...
2    i'd like to puts some cd-roms on my ipad, is t...
3    my ipod is officially dead. i lost all my pict...
4    been fighting itunes all night! i only want th...
Name: tweet, dtype: object

In [29]:
#removing special characters
ds['tweet'] = ds['tweet'].str.replace('[^\w\s]','')
ds['tweet'].head()

0    i hate the new iphone upgrade wont let me down...
1    currently shitting my fucking pants apple imac...
2    id like to puts some cdroms on my ipad is that...
3    my ipod is officially dead i lost all my pictu...
4    been fighting itunes all night i only want the...
Name: tweet, dtype: object

In [30]:
#removing stopwords from sentence
ds['tweet'] = ds['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
ds['tweet'].head()

0    hate new iphone upgrade wont let download apps...
1    currently shitting fucking pants apple imac ca...
2    id like puts cdroms ipad possible yes wouldnt ...
3    ipod officially dead lost pictures videos 1d 5...
4                fighting itunes night want music paid
Name: tweet, dtype: object

In [31]:
#finding out 10 most occuring words in the ds dataframe
freq = pd.Series(' '.join(ds['tweet']).split()).value_counts()[:10]
freq

iphone     861
apple      729
samsung    354
new        290
phone      228
sony       202
follow     159
ipad       115
love       113
like       107
dtype: int64

In [32]:
#removing 10 most occuring words from ds dataframe
freq = list(freq.index)
ds['tweet'] = ds['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
ds['tweet'].head()

0        hate upgrade wont let download apps ugh sucks
1    currently shitting fucking pants imac cashmone...
2     id puts cdroms possible yes wouldnt block screen
3    ipod officially dead lost pictures videos 1d 5...
4                fighting itunes night want music paid
Name: tweet, dtype: object

In [33]:
#finding out 10 least occuring words in ds dataframe
freq = pd.Series(' '.join(df['tweet']).split()).value_counts()[-10:]
freq

issuesapple       1
fp                1
djesminc          1
nightlife         1
et                1
366               1
gamerguy          1
shae              1
yeeeeh            1
havefunwithher    1
dtype: int64

In [34]:
#removing 10 least occuring words from ds dataframe
freq = list(freq.index)
ds['tweet'] = ds['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
ds['tweet'].head()

0        hate upgrade wont let download apps ugh sucks
1    currently shitting fucking pants imac cashmone...
2     id puts cdroms possible yes wouldnt block screen
3    ipod officially dead lost pictures videos 1d 5...
4                fighting itunes night want music paid
Name: tweet, dtype: object

In [35]:
#lemmetizing words to their base form
from textblob import Word
ds['tweet'] = ds['tweet'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
ds['tweet'].head()

0         hate upgrade wont let download apps ugh suck
1    currently shitting fucking pant imac cashmoney...
2      id put cdroms possible yes wouldnt block screen
3    ipod officially dead lost picture video 1d 5so...
4                fighting itunes night want music paid
Name: tweet, dtype: object

In [36]:
ds.head()

Unnamed: 0,id,tweet,word_count,char_count,avg_word,stopwords,hastags,numerics,upper
0,7921,hate upgrade wont let download apps ugh suck,14,77,4.571429,2,3,0,1
1,7922,currently shitting fucking pant imac cashmoney...,11,84,7.4,1,5,0,0
2,7923,id put cdroms possible yes wouldnt block screen,20,104,4.2,10,0,0,1
3,7924,ipod officially dead lost picture video 1d 5so...,23,129,4.652174,8,2,0,2
4,7925,fighting itunes night want music paid,14,70,4.071429,4,0,0,2


In [37]:
df.head()

Unnamed: 0,id,label,tweet,word_count,char_count,avg_word,stopwords,hastags,numerics,upper
0,1,0,fingerprint pregnancy test android apps beauti...,13,107,7.916667,0,11,0,0
1,2,0,finally transparant silicon case thanks uncle ...,17,97,5.0625,3,5,0,1
2,3,0,would go talk makememories unplug relax smartp...,15,101,6.214286,1,8,0,0
3,4,0,im wired know im george made way cute daventry...,17,81,4.0625,2,4,0,2
4,5,1,amazing service wont even talk question unless...,23,124,4.434783,9,0,0,2


In [38]:
#converting tweets in tfidf features with constraint of max number of features set to 10000
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=10000)
train_vectors = vectorizer.fit_transform(df.loc[:, 'tweet'].values)
test_vectors = vectorizer.transform(ds.loc[:, 'tweet'].values)
print(train_vectors.shape, test_vectors.shape)

(7920, 10000) (1953, 10000)


In [39]:
new_mat1 = df[['word_count', 'char_count', 'avg_word', 'stopwords', 'hastags', 'numerics', 'upper']].values.tolist()
new_mat2 = ds[['word_count', 'char_count', 'avg_word', 'stopwords', 'hastags', 'numerics', 'upper']].values.tolist()
new_mat1 = np.array(new_mat1)
new_mat2 = np.array(new_mat2)
#new_mat.shape

In [40]:
train_vectors1 = train_vectors
test_vectors1 = test_vectors
#np.append(train_vectors1, np.array(new_mat), axis = 0)


In [41]:
#print(np.array(new_mat).shape)
print(train_vectors1.shape)
print(test_vectors1.shape)

(7920, 10000)
(1953, 10000)


In [42]:
from scipy import sparse
train_vectors1 = sparse.hstack((train_vectors1,new_mat1)).A
test_vectors1 = sparse.hstack((test_vectors1,new_mat2)).A

In [43]:
print(train_vectors1.shape)
print(test_vectors1.shape)

(7920, 10007)
(1953, 10007)


In [44]:
y_train = df.iloc[:,1]
y_test = ds.iloc[:,1]

In [45]:
#Splitting train and test values
from sklearn.model_selection import train_test_split
xTrain, xTest, yTrain, yTest = train_test_split(train_vectors, y_train, test_size = 0.2, random_state = 0)

In [47]:
#Using multinomial naieve bayes as binary classifier
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=.4).fit(xTrain, yTrain)

In [48]:
#calculating accuracy score on xTest
from  sklearn.metrics  import accuracy_score
predicted = clf.predict(xTest)
print(accuracy_score(yTest,predicted))

0.8857323232323232


In [49]:
#training the classifier on the total train set and predicting on the whole test set
clf = MultinomialNB(alpha=.4).fit(train_vectors, y_train)
predicted = clf.predict(test_vectors)

In [50]:
#creating dbObj dataframe with predicted results
dfObj = pd.DataFrame(predicted.tolist(), columns = ['label'])

In [51]:
#creating a csv file with name set to predicted.csv
dfObj.to_csv('predicted.csv', sep='\t', index = False)