In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize  
import pickle
import os

In [35]:
df=pd.read_csv("data/text_emotion.csv")

In [36]:
df.head()

Unnamed: 0,text,emotion
0,carefully word blog posts amount criticism hea...,0
1,cannot remember little mermaid feeling carefre...,1
2,not feeling super well turns cold knocked next...,1
3,feel honored part group amazing talents,1
4,think helping also began feel pretty lonely lo...,0


In [37]:
df['emotion'].value_counts()

emotion
1    134205
0    120334
Name: count, dtype: int64

In [38]:
# balanced data set

In [39]:
df.shape

(254539, 2)

In [42]:
X=df['text']
y=df['emotion']

In [43]:
#train_test split

from sklearn.model_selection import train_test_split

In [44]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42,stratify=df['emotion'])

In [45]:
# Preprocessing

In [46]:
import re
def remove_special(text):
    clean_text = re.sub(r"[^a-zA-Z]", " ", text)
    return clean_text

In [47]:

def remove_html(text):
    pattern=re.compile('<.*?>')
    return pattern.sub(r'', text)

In [48]:
def remove_extra_white_spaces(text):
    pattern=r'\s+[a-zA-Z]\s+'
    without_space=re.sub(pattern=pattern,repl=" ",string=text)
    return without_space
    

In [49]:
from nltk.corpus import stopwords
stopwords=stopwords.words('english')

In [50]:
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [51]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sapta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [52]:



def remove_stop(text):
    new_text=[]
    for word in text.split():
        if word in stopwords:
            new_text.append('')
        else:
            new_text.append(word)
    x=new_text[:]
    new_text.clear()
    
    return " ".join(x)

In [53]:
X_train=X_train.reset_index()
X_test=X_test.reset_index()

In [54]:
X_train=X_train.drop(columns=['index'],axis=1)
X_test=X_test.drop(columns=['index'],axis=1)

In [55]:
X_train.head()

Unnamed: 0,text
0,feelings victimized
1,sat chair various times said anything times al...
2,still challenging feel reassured description son
3,feel sorry people openly slander pass ill judg...
4,love earlier master presence could feel lot ca...


In [56]:
def clean_text(X_train):
    X_train['text']=X_train['text'].str.lower()
    X_train['text']=X_train['text'].apply(remove_special)
    X_train['text']=X_train['text'].apply(remove_html)
    X_train['text']=X_train['text'].apply(remove_extra_white_spaces)
    X_train['text']=X_train['text'].apply(remove_stop)
    X_train['text']=X_train['text'].apply(stem_words)
    

    return X_train

In [59]:
X_train=clean_text(X_train)
X_test=clean_text(X_test)

In [60]:
X_train.head()

Unnamed: 0,text
0,feel victim
1,sat chair variou time said anyth time alway wa...
2,still challeng feel reassur descript son
3,feel sorri peopl openli slander pass ill judge...
4,love earlier master presenc could feel lot cas...


In [61]:
tfidf=TfidfVectorizer(max_features=2000,ngram_range=(1,2))

In [62]:
X_train_tfidf=tfidf.fit_transform(X_train['text'])
X_test_tfidf=tfidf.transform(X_test['text'])

In [63]:
X_train_tfidf.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [64]:
from sklearn.linear_model import LogisticRegression
model=LogisticRegression(solver='liblinear')

In [65]:
model.fit(X_train_tfidf,y_train)

In [66]:
y_pred_test=model.predict(X_test_tfidf)
y_pred_train=model.predict(X_train_tfidf)

In [67]:
classifier=Pipeline(steps=[
    ('cleaner',clean_text()),
    ('vectorizer',tfidf),
    ('model',model)

])

TypeError: clean_text() missing 1 required positional argument: 'X_train'

In [32]:
from sklearn.metrics import accuracy_score
print("TRAIN: ",accuracy_score(y_train,y_pred_train)," ","TEST: ",accuracy_score(y_test,y_pred_test))

TRAIN:  0.9533076987295648   TEST:  0.950636442209476


NameError: name 'text_pipeline' is not defined

In [None]:
X_train

Unnamed: 0,text
0,feel victim
1,sat chair variou time said anyth time alway wa...
2,still challeng feel reassur descript son
3,feel sorri peopl openli slander pass ill judge...
4,love earlier master presenc could feel lot cas...
...,...
203626,felt need share one sentenc found one book del...
203627,posit idea sat feel groggi wait someth chang h...
203628,feel quit jolli even though pretti tire
203629,wear shirt feel artist artist look artist ye son


NameError: name 'word_tokenize' is not defined

In [None]:
remove_special("I,, can't go outside ")

'I   can t go outside '

In [None]:
content='I Have a Car'
content.lower()

'i have a car'

In [68]:
with open('model.pkl','rb') as file_obj:
               model=pickle.load(file_obj)


FileNotFoundError: [Errno 2] No such file or directory: 'model.pkl'

In [75]:

           
with open('artifacts/model.pkl','rb') as file_obj:
    model=pickle.load(file_obj)

FileNotFoundError: [Errno 2] No such file or directory: 'artifacts/model.pkl'

In [72]:
model_path

'artifacts\\model.pkl'