In [35]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


In [36]:
train = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/train.csv')
test = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/test.csv')
ss = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/sample_submission.csv')

In [37]:
print(train.shape)
print(test.shape)

(27481, 4)
(3534, 3)


In [38]:
train.dropna(inplace=True)

Explorative Data Analysis

In [39]:
train['text']=train['text'].str.upper()

In [40]:
def clean_text(text):
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    return text.lower()
    return re.sub(r'\d+', '', text) 
    return  " ".join(text.split()) # remove white space

In [41]:
import string
train['text'] = train['text'].apply(lambda x:''.join([i for i in x 
                                                  if i not in string.punctuation]))

In [42]:
train['text']=train['text'].apply(lambda X:clean_text(str(X)))

In [43]:
train

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,id have responded if i were going,"I`d have responded, if I were going",neutral
1,549e992a42,sooo sad i will miss you here in san diego,Sooo SAD,negative
2,088c60f138,my boss is bullying me,bullying me,negative
3,9642c003ef,what interview leave me alone,leave me alone,negative
4,358bd9e861,sons of why couldnt they put them on the rel...,"Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on denver husband l...,d lost,negative
27477,4f4c4fc327,ive wondered about rake to the client has ma...,", don`t force",negative
27478,f67aae2310,yay good for both of you enjoy the break you...,Yay good for both of you.,positive
27479,ed167662a5,but it was worth it,But it was worth it ****.,positive


Remove default stopwords

In [44]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
  
# remove stopwords function 
def remove_stopwords(text): 
    stop_words = set(stopwords.words("english")) 
    word_tokens = word_tokenize(text) 
    filtered_text = [word for word in word_tokens if word not in stop_words] 
    return filtered_text 
  


In [45]:
train['stop_words']=train['text'].apply(lambda x:remove_stopwords(str(x)))

**Stemming**:
Stemming is the process of getting the root form of a word. Stem or root is the part to which inflectional affixes (-ed, -ize, -de, -s, etc.) are added. The stem of a word is created by removing the prefix or suffix of a word. So, stemming a word may not result in actual words.

In [46]:
from nltk.stem.porter import PorterStemmer 
from nltk.tokenize import word_tokenize 
stemmer = PorterStemmer() 



In [47]:
def stem_words(text): 
    word_tokens = word_tokenize(text) 
    stems = [stemmer.stem(word) for word in word_tokens] 
    return stems 

In [48]:
train['stemming']=train['stop_words'].apply(lambda x:stem_words(str(x)))

In [49]:
train

Unnamed: 0,textID,text,selected_text,sentiment,stop_words,stemming
0,cb774db0d1,id have responded if i were going,"I`d have responded, if I were going",neutral,"[id, responded, going]","[[, 'id, ', ,, 'respond, ', ,, 'go, ', ]]"
1,549e992a42,sooo sad i will miss you here in san diego,Sooo SAD,negative,"[sooo, sad, miss, san, diego]","[[, 'sooo, ', ,, 'sad, ', ,, 'miss, ', ,, 'san..."
2,088c60f138,my boss is bullying me,bullying me,negative,"[boss, bullying]","[[, 'boss, ', ,, 'bulli, ', ]]"
3,9642c003ef,what interview leave me alone,leave me alone,negative,"[interview, leave, alone]","[[, 'interview, ', ,, 'leav, ', ,, 'alon, ', ]]"
4,358bd9e861,sons of why couldnt they put them on the rel...,"Sons of ****,",negative,"[sons, couldnt, put, releases, already, bought]","[[, 'son, ', ,, 'couldnt, ', ,, 'put, ', ,, 'r..."
...,...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on denver husband l...,d lost,negative,"[wish, could, come, see, u, denver, husband, l...","[[, 'wish, ', ,, 'could, ', ,, 'come, ', ,, 's..."
27477,4f4c4fc327,ive wondered about rake to the client has ma...,", don`t force",negative,"[ive, wondered, rake, client, made, clear, net...","[[, 'ive, ', ,, 'wonder, ', ,, 'rake, ', ,, 'c..."
27478,f67aae2310,yay good for both of you enjoy the break you...,Yay good for both of you.,positive,"[yay, good, enjoy, break, probably, need, hect...","[[, 'yay, ', ,, 'good, ', ,, 'enjoy, ', ,, 'br..."
27479,ed167662a5,but it was worth it,But it was worth it ****.,positive,[worth],"[[, 'worth, ', ]]"


**Lemmatization:**************
Like stemming, lemmatization also converts a word to its root form. The only difference is that lemmatization ensures that the root word belongs to the language. We will get valid words if we use lemmatization. In NLTK, we use the WordNetLemmatizer to get the lemmas of words. We also need to provide a context for the lemmatization. So, we add the part-of-speech as a parameter.

In [50]:
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize 
lemmatizer = WordNetLemmatizer() 
# lemmatize string 
def lemmatize_word(text): 
    word_tokens = word_tokenize(text) 
    # provide context i.e. part-of-speech 
    lemmas = [lemmatizer.lemmatize(word, pos ='v') for word in word_tokens] 
    return lemmas

In [51]:
train['lemmatize']=train['stop_words'].apply(lambda x:lemmatize_word(str(x)))

In [52]:
train

Unnamed: 0,textID,text,selected_text,sentiment,stop_words,stemming,lemmatize
0,cb774db0d1,id have responded if i were going,"I`d have responded, if I were going",neutral,"[id, responded, going]","[[, 'id, ', ,, 'respond, ', ,, 'go, ', ]]","[[, 'id, ', ,, 'responded, ', ,, 'going, ', ]]"
1,549e992a42,sooo sad i will miss you here in san diego,Sooo SAD,negative,"[sooo, sad, miss, san, diego]","[[, 'sooo, ', ,, 'sad, ', ,, 'miss, ', ,, 'san...","[[, 'sooo, ', ,, 'sad, ', ,, 'miss, ', ,, 'san..."
2,088c60f138,my boss is bullying me,bullying me,negative,"[boss, bullying]","[[, 'boss, ', ,, 'bulli, ', ]]","[[, 'boss, ', ,, 'bullying, ', ]]"
3,9642c003ef,what interview leave me alone,leave me alone,negative,"[interview, leave, alone]","[[, 'interview, ', ,, 'leav, ', ,, 'alon, ', ]]","[[, 'interview, ', ,, 'leave, ', ,, 'alone, ', ]]"
4,358bd9e861,sons of why couldnt they put them on the rel...,"Sons of ****,",negative,"[sons, couldnt, put, releases, already, bought]","[[, 'son, ', ,, 'couldnt, ', ,, 'put, ', ,, 'r...","[[, 'sons, ', ,, 'couldnt, ', ,, 'put, ', ,, '..."
...,...,...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on denver husband l...,d lost,negative,"[wish, could, come, see, u, denver, husband, l...","[[, 'wish, ', ,, 'could, ', ,, 'come, ', ,, 's...","[[, 'wish, ', ,, 'could, ', ,, 'come, ', ,, 's..."
27477,4f4c4fc327,ive wondered about rake to the client has ma...,", don`t force",negative,"[ive, wondered, rake, client, made, clear, net...","[[, 'ive, ', ,, 'wonder, ', ,, 'rake, ', ,, 'c...","[[, 'ive, ', ,, 'wondered, ', ,, 'rake, ', ,, ..."
27478,f67aae2310,yay good for both of you enjoy the break you...,Yay good for both of you.,positive,"[yay, good, enjoy, break, probably, need, hect...","[[, 'yay, ', ,, 'good, ', ,, 'enjoy, ', ,, 'br...","[[, 'yay, ', ,, 'good, ', ,, 'enjoy, ', ,, 'br..."
27479,ed167662a5,but it was worth it,But it was worth it ****.,positive,[worth],"[[, 'worth, ', ]]","[[, 'worth, ', ]]"


In [53]:
from sklearn.model_selection import train_test_split
X=train['text']
Y=train['sentiment']


In [54]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
vectorizer = CountVectorizer()

In [55]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.33, random_state=42)

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, encoding='utf-8', decode_error='ignore')
vectorizer.fit(X_train)
X_train=vectorizer.transform(X_train)
X_test=vectorizer.transform(X_test)

In [62]:
# Logistic Regresion
from sklearn.linear_model import LogisticRegression
model=LogisticRegression(solver='liblinear')
model.fit(X_train,Y_train)
print("Score on training data is: "+str(model.score(X_train,Y_train)))
print("Score on testing data is: "+str(model.score(X_test,Y_test)))

Score on training data is: 0.7880071696268535
Score on testing data is: 0.6828757305105304


In [68]:
import numpy as np
# Here 0 denotes a negative sentiment
model.predict(X_test[78])

array(['negative'], dtype=object)

In [74]:
stop_words = set(stopwords.words('english'))

In [73]:
import joblib
joblib.dump(stop_words,'stopwords.pkl') 
joblib.dump(model,'model.pkl')
joblib.dump(vectorizer,'vectorizer.pkl')


NameError: name 'en_stopwords' is not defined

In [None]:
#Naive Bayes Classifier
clf = MultinomialNB()
clf.fit(X_train,Y_train)
clf.score(X_test,Y_test)

In [None]:
y_pred = clf.predict(X_test)
print(classification_report(Y_test, y_pred))


In [None]:
message="id have responded if i were going	I`d have responded, if I "
data=[message]
data
vect = vectorizer.transform(data).toarray()
clf.predict(vect)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
LR=LogisticRegression()

In [None]:
model = LogisticRegression()
model.fit(X_train, Y_train)

In [None]:
import pickle
# save the model to disk
filename = 'finalized_model.h5'
pickle.dump(model, open(filename, 'wb'))

In [None]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, Y_test)
print(result)

In [None]:
loaded_model.score(X_train, Y_train)

In [None]:
model.predict(X_test[0])


In [None]:
#random classifier
from sklearn.ensemble import RandomForestClassifier
text_classifier = RandomForestClassifier(n_estimators=200, random_state=0)
text_classifier.fit(X_train, Y_train)





In [None]:
predictions = text_classifier.predict(X_test)

In [None]:
predictions

In [None]:
import pickle
# save the model to disk
filename = 'random_tweets.h5'
pickle.dump(model, open(filename, 'wb'))

In [None]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, Y_test)
print(result)

In [None]:
print(text_classifier.score(X_train, Y_train),text_classifier.score(X_test, Y_test))


In [None]:
message="i hate myself"
data22=[message]
vect=vectorizer.transform(data22).toarray()
model.predict(vect)

In [None]:
    text_classifier.predict(vectorizer.transform(vector.toarray()))

In [None]:
#Score of the XGBoost Classifier
from xgboost import XGBClassifier
model = XGBClassifier()
model.fit(X_train,Y_train)
y_pred = model.predict(X_test)

In [None]:
model.score(X_train,Y_train)


In [None]:
model.score(X_test,Y_test)

In [None]:
model.predict(X[0])

In [None]:
model.predict_proba(X_test[35])

In [None]:
tender="['doctor', 'mbbs', 'student', 'found', 'shot', 'dead', 'hostel']"


In [None]:
models = [
          ('LogReg', LogisticRegression()), 
          ('RF', RandomForestClassifier()),
          ('MNB',MultinomialNB()),
          ('XGB', XGBClassifier())
        ]

In [None]:
model_scores.append(["Naive Bayes Classifier"],[clf.score(X_train,Y_train)],[clf.score(X_test,Y_test)])