## Package Importing

In [48]:
import pandas as pd
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

import pickle

import warnings
warnings.filterwarnings('ignore')

# Data Analysis

In [49]:
# Importing Data

data = pd.read_excel("D:\Imverse AI assignment\immverse_ai_eval_dataset.xlsx")

In [50]:
data.head(5)

Unnamed: 0,id,sentence,voice
0,1,The chef prepares the meal.,Active
1,2,The teacher explains the lesson clearly.,Active
2,3,The gardener waters the plants every morning.,Active
3,4,The kids play soccer in the park.,Active
4,5,The author wrote a thrilling novel.,Active


In [51]:
data.shape

(40, 3)

In [52]:
data.nunique

<bound method DataFrame.nunique of     id                                           sentence    voice
0    1                        The chef prepares the meal.   Active
1    2           The teacher explains the lesson clearly.   Active
2    3      The gardener waters the plants every morning.   Active
3    4                  The kids play soccer in the park.   Active
4    5                The author wrote a thrilling novel.   Active
5    6     The scientist conducts experiments in the lab.   Active
6    7                The company launched a new product.   Active
7    8            The artist paints a beautiful portrait.   Active
8    9                    The musician composes a melody.   Active
9   10          The photographer takes stunning pictures.   Active
10  11  The director shoots the film in various locati...   Active
11  12        The journalist reports the news accurately.   Active
12  13               The designer creates a modern dress.   Active
13  14                 The 

In [53]:
data.isnull().sum()

id          0
sentence    0
voice       0
dtype: int64

In [54]:
data.duplicated().sum()

0

In [55]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        40 non-null     int64 
 1   sentence  40 non-null     object
 2   voice     40 non-null     object
dtypes: int64(1), object(2)
memory usage: 1.1+ KB


In [56]:
data.describe(include='object')

Unnamed: 0,sentence,voice
count,40,40
unique,40,2
top,The chef prepares the meal.,Active
freq,1,20


In [57]:
data['sentence'][0:15]

0                           The chef prepares the meal.
1              The teacher explains the lesson clearly.
2         The gardener waters the plants every morning.
3                     The kids play soccer in the park.
4                   The author wrote a thrilling novel.
5        The scientist conducts experiments in the lab.
6                   The company launched a new product.
7               The artist paints a beautiful portrait.
8                       The musician composes a melody.
9             The photographer takes stunning pictures.
10    The director shoots the film in various locati...
11          The journalist reports the news accurately.
12                 The designer creates a modern dress.
13                   The engineer designs a new bridge.
14                The programmer codes the application.
Name: sentence, dtype: object

# Pre-Processing of Data

In [58]:
# Removing punctuation and convert to lowercase

def clean(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    return text

In [59]:

data['sentence'] = data['sentence'].apply(clean)

In [60]:
data['sentence'].head()

0                      the chef prepares the meal
1         the teacher explains the lesson clearly
2    the gardener waters the plants every morning
3                the kids play soccer in the park
4              the author wrote a thrilling novel
Name: sentence, dtype: object

In [61]:
# Creating Dummy Variable

data = pd.get_dummies(data, columns=['voice'], drop_first=True)

In [62]:
data.head()

Unnamed: 0,id,sentence,voice_Passive
0,1,the chef prepares the meal,False
1,2,the teacher explains the lesson clearly,False
2,3,the gardener waters the plants every morning,False
3,4,the kids play soccer in the park,False
4,5,the author wrote a thrilling novel,False


In [63]:
data['voice_Passive'] = data['voice_Passive'].astype(int)

In [64]:
data.head()

Unnamed: 0,id,sentence,voice_Passive
0,1,the chef prepares the meal,0
1,2,the teacher explains the lesson clearly,0
2,3,the gardener waters the plants every morning,0
3,4,the kids play soccer in the park,0
4,5,the author wrote a thrilling novel,0


In [65]:
data['voice_Passive'].tail(5)

35    1
36    1
37    1
38    1
39    1
Name: voice_Passive, dtype: int32

# Train_test_split and Cross-Validation

In [66]:
# Assigning x and y for training and testing

x = data['sentence']
y = data['voice_Passive']

In [67]:
# Split Data

x_train, x_tmp, y_train, y_tmp = train_test_split(x, y, train_size=0.6, random_state= 51)

x_val, x_test, y_val, y_test = train_test_split(x_tmp, y_tmp, test_size=0.5, random_state= 51)

# Applying Vectorization

In [68]:
Vectorizer = CountVectorizer()

In [69]:
x_train_v = Vectorizer.fit_transform(x_train)
x_val_v = Vectorizer.transform(x_val)
x_test_v = Vectorizer.transform(x_test)

# Model Classification Testing

In [70]:
# Logistic Regression

logit = LogisticRegression()
logit.fit(x_train_v, y_train)

In [71]:
y_pred = logit.predict(x_val_v)

In [72]:
score = accuracy_score(y_val, y_pred)
score

1.0

In [73]:
# Support Vector Machine

svc = SVC()
svc.fit(x_train_v, y_train)

In [74]:
y_pred = svc.predict(x_val_v)

In [75]:
score = accuracy_score(y_val, y_pred)
score

1.0

In [76]:
#KNN
knn=KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train_v,y_train)

In [77]:
y_pred= knn.predict(x_val_v)
score=accuracy_score(y_val,y_pred)
score

0.875

In [78]:
# DecisionTree

tree = DecisionTreeClassifier()
tree.fit(x_train_v, y_train)

In [79]:
y_pred = tree.predict(x_val_v)

In [80]:
score = accuracy_score(y_val, y_pred)
score

1.0

In [81]:
RFC=RandomForestClassifier()
RFC.fit(x_train_v,y_train)

In [82]:
y_pred = RFC.predict(x_val_v)
score = accuracy_score(y_val, y_pred)
score

1.0

# Findings : 
1.Logistic Regression : 1.0

2.Support Vector Classifier : 1.0

3.KNN Classifier: 0.875

4.Decision Tree Classifier : 1.0

5.Random Forest : 1.0



Decision Tree has most accuracy hence, best fit for the prediction to choose.

# Test Prediction

In [83]:
test_predict = tree.predict(x_test_v)

In [84]:
score = accuracy_score(y_test, test_predict)
score

1.0

# Actual vs Predicted value Testing

In [85]:
predicted = pd.DataFrame(y_test)
predicted['Predicted_voice_passive'] = test_predict

In [86]:
predicted

Unnamed: 0,voice_Passive,Predicted_voice_passive
11,0,0
2,0,0
14,0,0
18,0,0
34,1,1
15,0,0
0,0,0
23,1,1


# Creating files for deploying the Projects

In [87]:
filename1='vectorizer.sav'
pickle.dump(Vectorizer,open(filename1,'wb'))

filename2='model.sav'
pickle.dump(tree,open(filename2,'wb'))

# Custom Testing of the files

In [88]:
vect = pickle.load(open('vectorizer.sav', 'rb'))
model = pickle.load(open('model.sav', 'rb'))

In [89]:
text1 = 'I make a cake'
text2 = 'A cake is made by me'

In [90]:
text_list = [text1, text2]

In [91]:
def testing(text):
    text_v = vect.transform([text])
    result = model.predict(text_v)

    if result == 1:
        return f'{text}  : This is a Passive Statement'
    else:
        return f'{text}  : This is an Active Statement'

In [92]:
for text in text_list:
    print(testing(text))

I make a cake  : This is an Active Statement
A cake is made by me  : This is a Passive Statement


In [93]:
# This Predict Accurately