In [84]:
# Loading the required Libraries

import numpy as np
import glob
import pandas as pd
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from scipy.cluster.vq import whiten
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from textblob import TextBlob
from textblob import Word
stop = stopwords.words('english')

In [70]:
# Reading file 

rawData = pd.read_csv('train_tweets.txt',sep='\t',names=('Id','tweet'))

#print(rawData)

In [77]:
#Basic Feature Extraction

#Number of Words

rawData['word_count'] = rawData['tweet'].apply(lambda x: len(str(x).split(" ")))
rawData[['tweet','word_count']].head()

#Number of characters

rawData['char_count'] = rawData['tweet'].str.len() ## this also includes spaces
rawData[['tweet','char_count']].head()


#Average Length 

def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/len(words))

rawData['avg_word'] = rawData['tweet'].apply(lambda x: avg_word(x))
rawData[['tweet','avg_word']].head()

#Number of StopWords



rawData['stopwords'] = rawData['tweet'].apply(lambda x: len([x for x in x.split() if x in stop]))
rawData[['tweet','stopwords']].head()

#Number of Special Charaters 

rawData['hastags'] = rawData['tweet'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
rawData[['tweet','hastags']].head()

#Numbe rof Numerics 

rawData['numerics'] = rawData['tweet'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
rawData[['tweet','numerics']].head()

# Number of Upper Case 

rawData['upper'] = rawData['tweet'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
rawData[['tweet','upper']].head()

#print(rawData)


          Id                                              tweet  word_count  \
0       8746     @handle Let's try and catch up live next week!           9   
1       8746  Going to watch Grey's on the big screen - Thur...          11   
2       8746  @handle My pleasure Patrick....hope you are well!           7   
3       8746  @handle Hi there! Been traveling a lot and lot...          27   
4       8746  RT @handle Looking to Drink Clean & Go Green? ...          19   
5       8746  RT @handle: Ft. Hood officials confirm the 2 o...          17   
6       8746  RT @handle: Mickey Mouse is Getting a Make Ove...          11   
7       8746           @handle How did u get the invite Justin?           8   
8       8746  @handle I think I am still a good friend of he...          13   
9       8746  @handle I remember! I am fine - how are u? Wha...          12   
10      8746     @handle That's great - good for the coach!!!!!           8   
11      8746  @handle I don't want to picture u sitt

In [92]:
#Basic Pre-processing

#Lower case

rawData['tweet'] = rawData['tweet'].apply(lambda x: " ".join(x.lower() for x in x.split()))
rawData['tweet'].head()

#Punctuation removal

rawData['tweet'] = rawData['tweet'].str.replace('[^\w\s]','')
rawData['tweet'].head()

#Stopwords removal

rawData['tweet'] = rawData['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
rawData['tweet'].head()

#Frequent words removal


freq = pd.Series(' '.join(rawData['tweet']).split()).value_counts()[:10]
    
freq = list(freq.index)
rawData['tweet'] = rawData['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
rawData['tweet'].head()

#Rare words removal


freq = pd.Series(' '.join(rawData['tweet']).split()).value_counts()[-10:]

#Spelling correction


freq = list(freq.index)
rawData['tweet'] = rawData['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
rawData['tweet'].head()

#Tokenization

rawData['tweet'][:5].apply(lambda x: str(TextBlob(x).correct()))

WordList=[]
TextBlob(rawData['tweet'][1]).words
WordList=['thanks', 'lyft', 'credit', 'cant', 'use', 'cause', 'dont', 'offer', 'wheelchair', 'vans', 'pdx', 'disapointed', 'getthanked']

#Stemming


st = PorterStemmer()
rawData['tweet'][:5].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

#Lemmatization


rawData['tweet'] = rawData['tweet'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
rawData['tweet'].head()

print(rawData)


          Id                                              tweet  word_count  \
0       8746                                          try catch           9   
1       8746                    grey screen thursday indulgence          11   
2       8746                               pleasure patrickhope           7   
3       8746   hi traveling lot lot month recovered pneumonia r          27   
4       8746  drink clean green purchase clear2go bottle w f...          19   
5       8746  ft hood official confirm soldier initially hel...          17   
6       8746                       mickey mouse httpbitly1ustfu          11   
7       8746                                      invite justin           8   
8       8746                                                             13   
9       8746                                remember fine whats          12   
10      8746                                              coach           8   
11      8746                         picture sitting

In [None]:

# ANN Libraries 


from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Model of ANN

num_dataset = rawData

#print(dataset)
#names = ['authorID','NumLink','hastags','spCount','word_count','char_count','isCapitalize','RT_count']


X = num_dataset.iloc[:, 2:].values
y = num_dataset.iloc[:,0].values


encoder = LabelEncoder()
encoder.fit(y)
encoded_Y = encoder.transform(y)
encoded_Y

X_train, X_test, y_train, y_test = train_test_split(X, encoded_Y, test_size=0.20,random_state=1000)

input_dim = X_train.shape[1]
model = Sequential()
model.add(Dense(32, input_dim=input_dim, activation='relu')) # input dimension = dimension of festure vector
model.add(Dense(20, activation='relu'))
model.add(Dense(20, activation='relu'))
model.add(Dense(9293, activation='softmax')) # output layer = no. of classes
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])


batch_size = 16
epochs = 30
history = model.fit(X_train,y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(X_test,y_test))

Train on 262556 samples, validate on 65639 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30