# Importing Libraries 

In [1]:
import pandas as pd                #Used for Data Analysis.
import numpy as np                 #Used for scientific computing.
import matplotlib.pyplot as plt    #Used for Data Visualization.
import re           
#Regular Expressions Library,Searching within and changing text using formal patterns.
import string                      #Contains constants and classes for working with text.
import nltk                        #Natural language toolkit
import imblearn                    #To Deal with Imbalanced data.
from nltk.corpus import stopwords  #Stop words are a set of commonly used words in any language(used to remove stop words.).
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import tokenize          #Basically refers to splitting up a larger body of text into smaller lines,words.
from collections import Counter    #A Counter is a container that keeps track of how many times equivalent values are added

# Importing Dataset

In [2]:
df=pd.read_excel(r"C:\Users\TARUN\Desktop\NLP Project\1000 leads.xlsx")

# Data Pre-processing

In [3]:
df.head()

Unnamed: 0,Lead Name,Location,Status,Status information
0,Raja,hyderabad,Not Converted,"14/8/prema: share me details, available in evn..."
1,Anirudh Reddy,pune,Not Converted,"14/8/prema: cal me tmrw, shared details to ema..."
2,Sapna Dewani,bangalore,Converted,16|AuG|moHan:rnr
3,suresh,mumbai,Not Converted,14/8/17(Surendra):i want only Server 16|AuG|mo...
4,Akshay Shinde,hyderabad,Not Converted,"14/8/prema:rnr 16/8/prema: gave info, he said ..."


In [4]:
df.describe

<bound method NDFrame.describe of                 Lead Name   Location        Status   \
0                    Raja  hyderabad  Not Converted   
1           Anirudh Reddy       pune  Not Converted   
2            Sapna Dewani  bangalore     Converted    
3                  suresh     mumbai  Not Converted   
4           Akshay Shinde  hyderabad  Not Converted   
...                   ...        ...            ...   
996                 vipin  bangalore  Not Converted   
997               dheeraj    chennai  Not Converted   
998         kuldeep singh  bangalore  Not Converted   
999          ankur sharma  bangalore  Not Converted   
1000  Saugata Chakrabarti  bangalore  Not Converted   

                                     Status information  
0     14/8/prema: share me details, available in evn...  
1     14/8/prema: cal me tmrw, shared details to ema...  
2                                      16|AuG|moHan:rnr  
3     14/8/17(Surendra):i want only Server 16|AuG|mo...  
4     14/8/prem

In [5]:
#Finding null values.
df.isna().sum()

Lead Name              0
Location              21
Status                 3
Status information    24
dtype: int64

In [6]:
#Dropping null values.
df.dropna(axis=0,inplace=True)

In [7]:
#Finding null values after removing.
df.isna().sum()

Lead Name             0
Location              0
Status                0
Status information    0
dtype: int64

In [8]:
#Printing Column Names
df.columns

Index(['Lead Name', 'Location', 'Status ', 'Status information'], dtype='object')

In [9]:
#Removing Unwanted Spaces in Column Names.
df.columns = df.columns.str.strip()                                                       #aslo can replace with replace(' ','')

In [10]:
#Column Names after removing unwanted spaces.
df.columns

Index(['Lead Name', 'Location', 'Status', 'Status information'], dtype='object')

In [11]:
#Finding null values in Status column.
df['Status'].isna().sum()

0

In [12]:
#Finding unique values in Status column.
df['Status'].unique()  

array(['Not Converted', 'Converted ', 'NOt Converted', 'Conveted'],
      dtype=object)

In [13]:
#Removing Unwanted Spaces in Column Name 'Status'
df['Status']=df['Status'].str.strip()

In [14]:
#Status column Data after removing unwanted spaces.
df['Status'].unique()  

array(['Not Converted', 'Converted', 'NOt Converted', 'Conveted'],
      dtype=object)

In [15]:
#Counting Unique values count.
df['Status'].value_counts()

Not Converted    818
Converted        120
NOt Converted     11
Conveted           7
Name: Status, dtype: int64

In [16]:
#Modifying 'Status' Column Data.
df['Status']=df["Status"].replace('Conveted','Converted')
df['Status']=df["Status"].replace('NOt Converted','Not Converted') 

In [17]:
#Counting Unique values count.
df['Status'].value_counts()

Not Converted    829
Converted        127
Name: Status, dtype: int64

In [18]:
#Modifying Column Name 'Status information' DataType as String.
df['Status information']=df['Status information'].astype(str)

In [19]:
df.dtypes

Lead Name             object
Location              object
Status                object
Status information    object
dtype: object

In [20]:
df.head()

Unnamed: 0,Lead Name,Location,Status,Status information
0,Raja,hyderabad,Not Converted,"14/8/prema: share me details, available in evn..."
1,Anirudh Reddy,pune,Not Converted,"14/8/prema: cal me tmrw, shared details to ema..."
2,Sapna Dewani,bangalore,Converted,16|AuG|moHan:rnr
3,suresh,mumbai,Not Converted,14/8/17(Surendra):i want only Server 16|AuG|mo...
4,Akshay Shinde,hyderabad,Not Converted,"14/8/prema:rnr 16/8/prema: gave info, he said ..."


# Data Cleaning

In [21]:
#Not done Stemming or Lemmatization
#Stemminng is the process of reducing(reduce) inflection in words to their "root" forms.Such as mapping(map) a group of words to the same stem.
#Stemming and Lemmatization return a word to its simpler root form.
#Stemming returns “studi” as the root form of “studies”. Lemmatization returns “study” as the root form of “studies”. 
#The root form returned by lemmatization has a meaning. The root form of stemming sometimes does not have a meaning.
def clean_text(text):
    text=text.lower() 
    text=re.sub(r"\d+","",text) #Removes special characters and digits.
    text=text.translate(str.maketrans("","",string.punctuation))  #.,! removes
    text=re.sub('\s+',' ',text) #Eliminate duplicate whitespaces using wildcards(iam   legend.)
    text=[token for token in text.split() if len(token)>2]   #if token>2 accept
    text=' '.join(text) #joining the text
    shortword=re.compile(r'\W*\b\w{10,}\b')   #
    text=shortword.sub('',text)
    #stop words are words which are filtered out before or after processing of natural language data.(ex:he,she,is,that,this)
    #Frequently occurring words are removed from the corpus for the sake of text-normalization.
    stop_words=set(stopwords.words('english')) 
    #Tokenization is the process of breaking text up into smaller chunks as per our requirements.
    #Word tokenization is the process of breaking a sentence into words
    tokens=word_tokenize(text) 
    
    text=''.join(text)
    return text

In [22]:
df.head()

Unnamed: 0,Lead Name,Location,Status,Status information
0,Raja,hyderabad,Not Converted,"14/8/prema: share me details, available in evn..."
1,Anirudh Reddy,pune,Not Converted,"14/8/prema: cal me tmrw, shared details to ema..."
2,Sapna Dewani,bangalore,Converted,16|AuG|moHan:rnr
3,suresh,mumbai,Not Converted,14/8/17(Surendra):i want only Server 16|AuG|mo...
4,Akshay Shinde,hyderabad,Not Converted,"14/8/prema:rnr 16/8/prema: gave info, he said ..."


In [23]:
df['Status information']=df['Status information'].apply(clean_text)

In [24]:
df.head()

Unnamed: 0,Lead Name,Location,Status,Status information
0,Raja,hyderabad,Not Converted,prema share details available evng prema postp...
1,Anirudh Reddy,pune,Not Converted,prema cal tmrw shared details email prema shar...
2,Sapna Dewani,bangalore,Converted,
3,suresh,mumbai,Not Converted,surendrai want only server busy for server
4,Akshay Shinde,hyderabad,Not Converted,premarnr prema gave info said will revert prem...


In [25]:
X=df['Status information'] #independent variables(X)

In [26]:
y=df['Status'].astype(str) #dependent variable(y)

In [27]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()                            #Encoding is a technique of converting categorical variables into numerical values.
y=le.fit_transform(y)                        #Coverting into numerical format.

In [28]:
y

array([1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,

# Text Pre-processing

In [29]:
#TF-IDF
#TF-Term Frequency=Number of times the word occurs in the sentence/Total number of words in the sentence.
#Inverse Document frequency=log(Number of sentences/Number of sentences containing words.)
#Term Frequency–Inverse Document frequency.
#It is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
#Term frequency-inverse document frequency is a text vectorizer that transforms the text into a usable vector. 
#It combines 2 concepts, Term Frequency (TF) and Document Frequency (DF). The term frequency is the number of occurrences of a specific term in a document.

In [30]:
df_tfidf=vectorizer.fit_transform(X)          #Results sparse matrix.

In [31]:
df_tfidf

<956x1087 sparse matrix of type '<class 'numpy.float64'>'
	with 8487 stored elements in Compressed Sparse Row format>

In [32]:
#Converting sparse matrix into dataframe.
train_df = pd.DataFrame(df_tfidf.toarray(),columns=vectorizer.get_feature_names())

In [33]:
train_df.head()

Unnamed: 0,abdul,able,about,abroad,abt,accepting,actually,addres,address,admin,...,wwant,xcler,xlr,yesterday,yesterdy,yet,you,your,yrs,ystday
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.257311,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
train_df.columns

Index(['abdul', 'able', 'about', 'abroad', 'abt', 'accepting', 'actually',
       'addres', 'address', 'admin',
       ...
       'wwant', 'xcler', 'xlr', 'yesterday', 'yesterdy', 'yet', 'you', 'your',
       'yrs', 'ystday'],
      dtype='object', length=1087)

In [35]:
#Dealing with imbalanced data.
#Applying SMOTE(Synthetic Minority Over-Sampling Technique.)
from imblearn import under_sampling,over_sampling
from collections import Counter
from imblearn.over_sampling import SMOTE
ros=SMOTE()
x_ros,y_ros=ros.fit_resample(train_df,y)
print('Original dataset shape %s' % Counter(y_ros))

Original dataset shape Counter({1: 829, 0: 829})


# Model Building

In [36]:
from sklearn.model_selection import train_test_split
X_train, X_cv, y_train, y_cv = train_test_split(x_ros, y_ros, test_size=0.20, random_state=1)

In [37]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,f1_score

In [38]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier() 
rf.fit(X_train,y_train)

RandomForestClassifier()

In [39]:
rf_pred=rf.predict(X_cv)

In [40]:
print(confusion_matrix(y_cv,rf_pred))

[[152  10]
 [ 24 146]]


In [41]:
print(classification_report(y_cv,rf_pred))

              precision    recall  f1-score   support

           0       0.86      0.94      0.90       162
           1       0.94      0.86      0.90       170

    accuracy                           0.90       332
   macro avg       0.90      0.90      0.90       332
weighted avg       0.90      0.90      0.90       332



In [42]:
print(accuracy_score(y_cv,rf_pred))

0.8975903614457831


In [43]:
print(f1_score(y_cv,rf_pred,average='micro'))

0.8975903614457831


In [44]:
rf_pred

array([1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,