In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df = pd.read_excel("1000 leads.xlsx").drop('Unnamed: 4', axis=1)

In [3]:
df.columns

Index(['Lead Name', 'Location', 'Status ', 'Status information'], dtype='object')

In [4]:
df

Unnamed: 0,Lead Name,Location,Status,Status information
0,Raja,hyderabad,Not Converted,"14/8/prema: share me details, available in evn..."
1,Anirudh Reddy,pune,Not Converted,"14/8/prema: cal me tmrw, shared details to ema..."
2,Sapna Dewani,bangalore,Converted,16|AuG|moHan:rnr
3,suresh,mumbai,Not Converted,14/8/17(Surendra):i want only Server 16|AuG|mo...
4,Akshay Shinde,hyderabad,Not Converted,"14/8/prema:rnr 16/8/prema: gave info, he said ..."
...,...,...,...,...
996,vipin,bangalore,Not Converted,25/4/17(Surendra):please send me details tomor...
997,dheeraj,chennai,Not Converted,"24/4/17(prema): need CT, but our venue is far...."
998,kuldeep singh,bangalore,Not Converted,24/Apr/moHan:intrstd in evng batch trail perd ...
999,ankur sharma,bangalore,Not Converted,1/5/17(Surendra):please share me details 11/5/...


In [5]:
#Lowercasing
df['Status information'] = df['Status information'].str.lower()

In [6]:
df['Status information']

0       14/8/prema: share me details, available in evn...
1       14/8/prema: cal me tmrw, shared details to ema...
2                                        16|aug|mohan:rnr
3       14/8/17(surendra):i want only server 16|aug|mo...
4       14/8/prema:rnr 16/8/prema: gave info, he said ...
                              ...                        
996     25/4/17(surendra):please send me details tomor...
997     24/4/17(prema): need ct, but our venue is far....
998     24/apr/mohan:intrstd in evng batch trail perd ...
999     1/5/17(surendra):please share me details 11/5/...
1000    24/4/17(prema): shared details need only cert....
Name: Status information, Length: 1001, dtype: object

In [7]:
#Removing Punctuations
import string
exclude = string.punctuation.replace("'", "", 1)

def remove_punc(text):
    for char in exclude:
        text = str(text).replace(char," ")
    return text

In [8]:
df['Status information'] = df['Status information'].apply(remove_punc)

In [9]:
df['Status information']

0       14 8 prema  share me details  available in evn...
1       14 8 prema  cal me tmrw  shared details to ema...
2                                        16 aug mohan rnr
3       14 8 17 surendra  i want only server 16 aug mo...
4       14 8 prema rnr 16 8 prema  gave info  he said ...
                              ...                        
996     25 4 17 surendra  please send me details tomor...
997     24 4 17 prema   need ct  but our venue is far ...
998     24 apr mohan intrstd in evng batch trail perd ...
999     1 5 17 surendra  please share me details 11 5 ...
1000    24 4 17 prema   shared details need only cert ...
Name: Status information, Length: 1001, dtype: object

In [10]:
chat_words = pd.read_excel("Chat Words Short.xlsx")
chat_words['symbol']=chat_words['symbol'].str.lower().str.rstrip(" ")
chat_words['meaning'] = chat_words['meaning'].str.lower().str.rstrip(" ")

In [11]:
chat_words[chat_words['symbol'] == 'rofl']['meaning']

Series([], Name: meaning, dtype: object)

In [12]:
chat_words['meaning'][0].rstrip(" ")

' rolling on floor laughing'

In [13]:
def chat_conversion(text):
    new_text = []
    #set_words = set()
    for w in text.lower().split():
        #print(w)
        if w in chat_words['symbol'].tolist():
            #print(w)
            #set_words.add(w)
            new_text.append(chat_words[chat_words['symbol']==w]['meaning'].values[0])
        else:
            new_text.append(w)
    #print(new_text)
    #print(set_words)
    return " ".join(new_text)

In [14]:
chat_conversion('one of the other reviewers has mentioned rofl')

'one of the other reviewers has mentioned rofl'

In [15]:
df['Status information'] = df['Status information'].apply(chat_conversion)

In [16]:
df['Status information'][897]

'3 5 17 surendra rnr 9 5 17 surendra call disconnected'

In [17]:
df[df['Lead Name']=='madhuri']['Status information'][896]

'4 5 17 prema shared details need to call later 7 30pm 9 5 17 prema rnr 15 5 17 gowtham not interested'

In [18]:
#Spelling Correction:
from textblob import TextBlob

In [19]:
def spelling_correct(text):
    #print('----------------------------------------------------------------------------------------------')
    #print(text)
    #print("\n")
    return str(TextBlob(text).correct())

In [20]:
spelling_correct("Friyay")

'Friday'

In [21]:
df['Status information'] = df['Status information'].apply(spelling_correct)

In [22]:
df[df['Lead Name']=='madhuri']['Status information'][896]

'4 5 17 prima shared details need to call later 7 pm 9 5 17 prima and 15 5 17 gowtham not interested'

In [23]:
#Removing stopwords
from nltk.corpus import stopwords
stopwords.words("English")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [24]:
stop_words= stopwords.words("English")

In [25]:
stop_words.remove('not')

In [26]:
if 'not' in stop_words:
    print('in')
else: 
    print('not found')

not found


In [27]:
def remove_stopwords(text):
    new_text = []
    
    for word in text.split():
        if word in stop_words:
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    #print(x)
    new_text.clear()
    return " ".join(x)

In [28]:
df['Status information'] = df['Status information'].apply(remove_stopwords)

In [29]:
df[df['Lead Name']=='Risabh']['Status information'][874]

'24 5 17 gowtham already done'

In [30]:
df['Status information'] = df['Status information'].str.replace('prima', 'prema')

In [31]:
#Handling Emojis
import re
def remove_emoji(text):
    emoji_pattern = re.compile("["
                              u"\U0001F600-\U0001F64F"
                              u"\U0001F300-\U0001F5FF"
                              u"\U0001F680-\U0001F6FF"
                              u"\U0001F1E0-\U0001F1FF"
                              u"\U00002702-\U000027B0"
                              u"\U000024C2-\U0001F251"
                              "]+", flags = re.UNICODE)
    return emoji_pattern.sub(r'',text)

In [32]:
df['Status information'] = df['Status information'].apply(remove_emoji)

In [33]:
#Tokenization
from nltk.tokenize import word_tokenize, sent_tokenize

In [34]:
df['Status information'] = df['Status information'].apply(word_tokenize)

In [35]:
df['Status information']

0       [14, 8, prema, share, details, available, long...
1       [14, 8, prema, call, tomorrow, shared, details...
2                                        [16, aug, moran]
3       [14, 8, 17, surendra, want, server, 16, aug, m...
4       [14, 8, prema, 16, 8, prema, gave, said, rever...
                              ...                        
996     [25, 4, 17, surendra, please, send, details, t...
997     [24, 4, 17, prema, need, venue, far, shared, d...
998     [24, air, moran, intrusted, long, batch, trail...
999     [1, 5, 17, surendra, please, share, details, 1...
1000    [24, 4, 17, prema, shared, details, need, cent...
Name: Status information, Length: 1001, dtype: object

In [36]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [37]:
#Stemming
def stem_words(text):
    new_text = []
    for word in text:
        new_text.append(ps.stem(word))
    return " ".join(new_text)

In [38]:
df['Status information_stem'] = df['Status information'].apply(stem_words)

In [39]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

In [40]:
def lemmatize_words(text):
    new_text = []
    for word in text:
        new_text.append(wnl.lemmatize(word, pos='v'))
    return " ".join(new_text)

In [41]:
df['Status information_lem'] = df['Status information'].apply(lemmatize_words)

In [42]:
df

Unnamed: 0,Lead Name,Location,Status,Status information,Status information_stem,Status information_lem
0,Raja,hyderabad,Not Converted,"[14, 8, prema, share, details, available, long...",14 8 prema share detail avail long 18 8 prema ...,14 8 prema share detail available long 18 8 pr...
1,Anirudh Reddy,pune,Not Converted,"[14, 8, prema, call, tomorrow, shared, details...",14 8 prema call tomorrow share detail email 16...,14 8 prema call tomorrow share detail email 16...
2,Sapna Dewani,bangalore,Converted,"[16, aug, moran]",16 aug moran,16 aug moran
3,suresh,mumbai,Not Converted,"[14, 8, 17, surendra, want, server, 16, aug, m...",14 8 17 surendra want server 16 aug moran call...,14 8 17 surendra want server 16 aug moran call...
4,Akshay Shinde,hyderabad,Not Converted,"[14, 8, prema, 16, 8, prema, gave, said, rever...",14 8 prema 16 8 prema gave said revert 30 8 pr...,14 8 prema 16 8 prema give say revert 30 8 pre...
...,...,...,...,...,...,...
996,vipin,bangalore,Not Converted,"[25, 4, 17, surendra, please, send, details, t...",25 4 17 surendra pleas send detail tomorrow af...,25 4 17 surendra please send detail tomorrow a...
997,dheeraj,chennai,Not Converted,"[24, 4, 17, prema, need, venue, far, shared, d...",24 4 17 prema need venu far share detail 25 4 ...,24 4 17 prema need venue far share detail 25 4...
998,kuldeep singh,bangalore,Not Converted,"[24, air, moran, intrusted, long, batch, trail...",24 air moran intrust long batch trail per 26 a...,24 air moran intrust long batch trail per 26 a...
999,ankur sharma,bangalore,Not Converted,"[1, 5, 17, surendra, please, share, details, 1...",1 5 17 surendra pleas share detail 11 5 17 gow...,1 5 17 surendra please share detail 11 5 17 go...


In [43]:
df

Unnamed: 0,Lead Name,Location,Status,Status information,Status information_stem,Status information_lem
0,Raja,hyderabad,Not Converted,"[14, 8, prema, share, details, available, long...",14 8 prema share detail avail long 18 8 prema ...,14 8 prema share detail available long 18 8 pr...
1,Anirudh Reddy,pune,Not Converted,"[14, 8, prema, call, tomorrow, shared, details...",14 8 prema call tomorrow share detail email 16...,14 8 prema call tomorrow share detail email 16...
2,Sapna Dewani,bangalore,Converted,"[16, aug, moran]",16 aug moran,16 aug moran
3,suresh,mumbai,Not Converted,"[14, 8, 17, surendra, want, server, 16, aug, m...",14 8 17 surendra want server 16 aug moran call...,14 8 17 surendra want server 16 aug moran call...
4,Akshay Shinde,hyderabad,Not Converted,"[14, 8, prema, 16, 8, prema, gave, said, rever...",14 8 prema 16 8 prema gave said revert 30 8 pr...,14 8 prema 16 8 prema give say revert 30 8 pre...
...,...,...,...,...,...,...
996,vipin,bangalore,Not Converted,"[25, 4, 17, surendra, please, send, details, t...",25 4 17 surendra pleas send detail tomorrow af...,25 4 17 surendra please send detail tomorrow a...
997,dheeraj,chennai,Not Converted,"[24, 4, 17, prema, need, venue, far, shared, d...",24 4 17 prema need venu far share detail 25 4 ...,24 4 17 prema need venue far share detail 25 4...
998,kuldeep singh,bangalore,Not Converted,"[24, air, moran, intrusted, long, batch, trail...",24 air moran intrust long batch trail per 26 a...,24 air moran intrust long batch trail per 26 a...
999,ankur sharma,bangalore,Not Converted,"[1, 5, 17, surendra, please, share, details, 1...",1 5 17 surendra pleas share detail 11 5 17 gow...,1 5 17 surendra please share detail 11 5 17 go...


In [44]:
df['Status '].value_counts()

Not Converted    856
Converted        124
NOt Converted     11
Conveted           7
Name: Status , dtype: int64

In [45]:
df.columns

Index(['Lead Name', 'Location', 'Status ', 'Status information',
       'Status information_stem', 'Status information_lem'],
      dtype='object')

In [46]:
df['Status '] = np.where(df['Status '] == 'NOt Converted', 'Not Converted', np.where(df['Status '] == 'Conveted', 'Converted', np.where(df['Status ']=='Converted ', 'Converted', df['Status '])))

In [47]:
df['Status '].value_counts()

Not Converted    867
Converted        131
Name: Status , dtype: int64

In [48]:
[df['Status ']=='Converted']

[0       False
 1       False
 2        True
 3       False
 4       False
         ...  
 996     False
 997     False
 998     False
 999     False
 1000    False
 Name: Status , Length: 1001, dtype: bool]

In [49]:
#Data is imbalanced
df_converted = df[df['Status ']=='Converted']
df_notconverted = df[df['Status ']=='Not Converted']

In [50]:
from sklearn.utils import resample
data_upsample = resample(df_converted, replace=True, n_samples = len(df_notconverted), random_state = 42)

In [51]:
data_upsample.shape

(867, 6)

In [52]:
df_notconverted.shape

(867, 6)

In [53]:
df_new = pd.concat([df_notconverted, data_upsample])
df_new.reset_index(drop=True, inplace=True)

In [54]:
df_new.shape

(1734, 6)

In [55]:
X = pd.DataFrame(df_new['Status information_stem'], columns = ['Status information_stem'])
#X = df_new['Status information_stem'].values
y = df_new['Status ']

In [56]:
from sklearn.preprocessing import LabelEncoder
lc = LabelEncoder()

In [57]:
y = lc.fit_transform(y)

In [58]:
y

array([1, 1, 1, ..., 0, 0, 0])

In [59]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(pd.DataFrame(X), y, test_size = 0.2, random_state=1)

In [60]:
X_train.columns

Index(['Status information_stem'], dtype='object')

In [61]:
#Applying BOW
from sklearn.feature_extraction.text import CountVectorizer
cv1 = CountVectorizer()

In [62]:
y_train

array([0, 1, 1, ..., 0, 1, 0])

In [63]:
X_train_bow = cv1.fit_transform(X_train['Status information_stem']).toarray()
X_test_bow = cv1.transform(X_test['Status information_stem']).toarray()

In [64]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

In [65]:
X_train_bow.shape

(1387, 711)

In [66]:
X_test_bow.shape

(347, 711)

In [67]:
y_test.shape

(347,)

In [68]:
def algo_implement(x_val_tr, x_val_te, ml_algo):
    output = []
    
    ml_algo.fit(x_val_tr, y_train)
    y_pred = ml_algo.predict(x_val_te)
    acc_score = accuracy_score(y_test, y_pred)
    for i in range(100):
        output.append(acc_score)
    acc_score_final = sum(output)/len(output)
    print(f"Precision Score = {precision_score(y_test, y_pred)}")
    print(f"Recall Score = {recall_score(y_test, y_pred)}")
    print(f"F1 Score = {f1_score(y_test, y_pred)}")
    print(f"Accuracy Score = {accuracy_score(y_test, y_pred)}")
    
    return acc_score_final

In [69]:
#ML Algo 1
from sklearn.naive_bayes import GaussianNB

ml1 = GaussianNB()
ml1_output = algo_implement(X_train_bow,X_test_bow,ml1)

print(f"The accuracy of Gaussian Model: {round(ml1_output*100,2)}%")

Precision Score = 1.0
Recall Score = 0.42196531791907516
F1 Score = 0.5934959349593496
Accuracy Score = 0.7118155619596542
The accuracy of Gaussian Model: 71.18%


In [70]:
#ML Algo 2
from sklearn.ensemble import RandomForestClassifier

ml2 = RandomForestClassifier()
ml2_output = algo_implement(X_train_bow,X_test_bow,ml2)

print(f"The accuracy of Random Forest Model: {round(ml2_output*100,2)}%")

Precision Score = 0.9877300613496932
Recall Score = 0.930635838150289
F1 Score = 0.9583333333333334
Accuracy Score = 0.9596541786743515
The accuracy of Random Forest Model: 95.97%


In [71]:
#ML Algo 3
from sklearn.svm import SVC

ml3 = SVC()
ml3_output = algo_implement(X_train_bow,X_test_bow,ml3)

print(f"The accuracy of Support Vector Machine Model: {round(ml3_output*100,2)}%")

Precision Score = 0.9007633587786259
Recall Score = 0.6820809248554913
F1 Score = 0.7763157894736842
Accuracy Score = 0.8040345821325648
The accuracy of Support Vector Machine Model: 80.4%


In [72]:
#Applying BOW with ngrams since we can check for Not Interested too 
from sklearn.feature_extraction.text import CountVectorizer
cv2 = CountVectorizer(ngram_range = (1,2))

In [73]:
X_train_ngram = cv2.fit_transform(X_train['Status information_stem']).toarray()
X_test_ngram = cv2.transform(X_test['Status information_stem']).toarray()

In [74]:
#ML Algo 1
from sklearn.naive_bayes import GaussianNB

mln1 = GaussianNB()
mln1_output = algo_implement(X_train_ngram,X_test_ngram,mln1)

print(f"The accuracy of Gaussian Model: {round(mln1_output*100,2)}%")

Precision Score = 1.0
Recall Score = 0.7976878612716763
F1 Score = 0.887459807073955
Accuracy Score = 0.899135446685879
The accuracy of Gaussian Model: 89.91%


In [75]:
#ML Algo 2
from sklearn.ensemble import RandomForestClassifier

mln2 = RandomForestClassifier()
mln2_output = algo_implement(X_train_ngram,X_test_ngram,mln2)

print(f"The accuracy of Random Forest Model: {round(mln2_output*100,2)}%")

Precision Score = 0.9878787878787879
Recall Score = 0.9421965317919075
F1 Score = 0.9644970414201184
Accuracy Score = 0.9654178674351584
The accuracy of Random Forest Model: 96.54%


In [76]:
#ML Algo 3
from sklearn.svm import SVC

mln3 = SVC()
mln3_output = algo_implement(X_train_ngram,X_test_ngram,mln3)

print(f"The accuracy of Support Vector Machine Model: {round(mln3_output*100,2)}%")

Precision Score = 0.9338235294117647
Recall Score = 0.7341040462427746
F1 Score = 0.8220064724919094
Accuracy Score = 0.8414985590778098
The accuracy of Support Vector Machine Model: 84.15%


In [77]:
#Applying TF IDF: Term Frequency-Inverse Document Frequency
from sklearn.feature_extraction.text import TfidfVectorizer
cv3 = TfidfVectorizer()

In [78]:
X_train_tfidf = cv3.fit_transform(X_train['Status information_stem']).toarray()
X_test_tfidf = cv3.transform(X_test['Status information_stem']).toarray()

In [79]:
#ML Algo 1
from sklearn.naive_bayes import GaussianNB

mlt1 = GaussianNB()
mlt1_output = algo_implement(X_train_tfidf,X_test_tfidf,mlt1)

print(f"The accuracy of Gaussian Model: {round(mlt1_output*100,2)}%")

Precision Score = 1.0
Recall Score = 0.43352601156069365
F1 Score = 0.6048387096774194
Accuracy Score = 0.7175792507204611
The accuracy of Gaussian Model: 71.76%


In [80]:
#ML Algo 2
from sklearn.ensemble import RandomForestClassifier

mlt2 = RandomForestClassifier()
mlt2_output = algo_implement(X_train_tfidf,X_test_tfidf,mlt2)

print(f"The accuracy of Random Forest Model: {round(mlt2_output*100,2)}%")

Precision Score = 0.9880239520958084
Recall Score = 0.953757225433526
F1 Score = 0.9705882352941175
Accuracy Score = 0.9711815561959655
The accuracy of Random Forest Model: 97.12%


In [81]:
#ML Algo 3
from sklearn.svm import SVC

mlt3 = SVC()
mlt3_output = algo_implement(X_train_tfidf,X_test_tfidf,mlt3)

print(f"The accuracy of Support Vector Machine Model: {round(mlt3_output*100,2)}%")

Precision Score = 0.9612903225806452
Recall Score = 0.861271676300578
F1 Score = 0.9085365853658537
Accuracy Score = 0.9135446685878963
The accuracy of Support Vector Machine Model: 91.35%


In [82]:
#We can see that we are getting higher accuracy with Random Forest and that for ngrams = 1,2 

In [83]:
model_final = mlt2.fit(X_train_tfidf, y_train)

In [84]:
import pickle

In [85]:
filename = "model.sav"

In [86]:
#pickle.dump(model_final, open(filename, 'wb'))

pickle.dump(model_final,open('model.pkl','wb'))
pickle.dump(cv3,open('cv.pkl','wb'))

In [87]:
load_model = pickle.load(open('model.pkl', 'rb'))

In [88]:
load_model.predict([X_test_tfidf[4]])

array([0])

In [89]:
X_test_tfidf[4]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.07834533, 0.        ,
       0.        , 0.        , 0.19786694, 0.        , 0.        ,
       0.        , 0.31899182, 0.        , 0.11302607, 0.        ,
       0.08531886, 0.        , 0.        , 0.        , 0.        ,
       0.10529502, 0.10891354, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.08559061, 0.        ,
       0.09629111, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.16673666, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [90]:
y_test[4]

0

In [91]:
new_input = [input()]

val = cv3.transform(new_input)

check = val.toarray()

anything


In [92]:
check

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [93]:
load_model.predict(check)

array([1])

In [94]:
def preprocess(val):
    val = str(val)
    print(val)
    #Lowercasing
    val = val.lower()
    print("Step 1")
    
    #Removing punctuations
    val = remove_punc(val)
    print("Step 2")
    
    #Removing Chat words
    val = chat_conversion(val)
    print("Step 3")
    
    #Spelling correction
    val = spelling_correct(val)
    val = val.replace('prima', 'prema')
    print("Step 4")
    
    #Remove stop words
    val = remove_stopwords(val)
    print("Step 5")
    
    #Remove emojis
    val = remove_emoji(val)
    print("Step 6")
    
    #Tokenize
    val = word_tokenize(val)
    print("Step 7")
    
    #Join
    val = " ".join(val)
    print(val)
    
    #Transform
    #val = cv3.transform(val).toarray()
    
    return val

In [95]:
input_new = "8/8/17(Surendra):i will think and get back"

In [96]:
val_pr = preprocess(input_new)

8/8/17(Surendra):i will think and get back
Step 1
Step 2
Step 3
Step 4
Step 5
Step 6
Step 7
8 8 17 surendra think get back


In [97]:
val_pr

'8 8 17 surendra think get back'

In [98]:
val = cv3.transform([val_pr])

check = val.toarray()

In [99]:
check.shape

(1, 711)

In [100]:
load_model.predict(check)

array([1])

In [101]:
X_test.shape

(347, 1)