## Text Classification for Active vs. Passive Voice Detection

In [1]:
# Packages

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import pickle



In [2]:
# Data
# Read the evaluation workbook once; `df` and `data` are two names for the
# same DataFrame object (no copy is made).
df = data = pd.read_excel('immverse_ai_eval_dataset.xlsx')

In [3]:
data.head()


Unnamed: 0,id,sentence,voice
0,1,The chef prepares the meal.,Active
1,2,The teacher explains the lesson clearly.,Active
2,3,The gardener waters the plants every morning.,Active
3,4,The kids play soccer in the park.,Active
4,5,The author wrote a thrilling novel.,Active


### Pre-Processing

In [4]:
df.isnull().sum()   

id          0
sentence    0
voice       0
dtype: int64

In [5]:
# Convert to lowercase (punctuation is kept as-is)

def clean(text):
    """Return ``text`` converted to lowercase.

    Only the letter case changes; punctuation and whitespace are left as
    they are.
    """
    return text.lower()

In [6]:
# Lowercase every sentence via clean(); the original-case text in the
# `sentence` column is overwritten.
data['sentence'] = data['sentence'].apply(clean)

In [7]:
data['sentence'].head()

0                      the chef prepares the meal.
1         the teacher explains the lesson clearly.
2    the gardener waters the plants every morning.
3                the kids play soccer in the park.
4              the author wrote a thrilling novel.
Name: sentence, dtype: object

In [8]:
# Create Dummy Variable

# One-hot encode `voice`. drop_first=True drops the first category's column,
# so a single indicator column remains (`voice_Passive` in the output below).
# NOTE: the cell is not re-runnable on its own, because `voice` no longer
# exists in `data` after the first run.
data = pd.get_dummies(data, columns=['voice'], drop_first=True)

In [9]:
data.head(1)

Unnamed: 0,id,sentence,voice_Passive
0,1,the chef prepares the meal.,False


In [10]:
# The indicator column is boolean after get_dummies (False in the output
# above); cast it to integers so the target is 0 / 1.
data['voice_Passive'] = data['voice_Passive'].astype(int)

In [11]:
data.head(1)

Unnamed: 0,id,sentence,voice_Passive
0,1,the chef prepares the meal.,0


In [12]:
data.tail(1)

Unnamed: 0,id,sentence,voice_Passive
1161,1162,the potential consequences are weighed by many...,1


### Train, Validate and Test Splitting

In [13]:
# Declare x and y

# Feature: the sentence text. Target: the 0/1 `voice_Passive` indicator.
x, y = data['sentence'], data['voice_Passive']

In [14]:
# Split Data

# First cut: 60 % of the rows go to training, the remaining 40 % are held back.
x_train, x_tmp, y_train, y_tmp = train_test_split(
    x, y, train_size=0.6, random_state=51
)

# Second cut: the held-back rows are halved into validation and test sets.
x_val, x_test, y_val, y_test = train_test_split(
    x_tmp, y_tmp, test_size=0.5, random_state=51
)

### Vectorization

In [15]:
# Token-count vectorizer with scikit-learn's default settings.
Vectorizer = CountVectorizer()

In [16]:
# The vocabulary is learned from the training split only; the validation and
# test splits are merely projected onto it.
x_train_v = Vectorizer.fit_transform(x_train)
x_val_v, x_test_v = (Vectorizer.transform(split) for split in (x_val, x_test))

### Classifier Models

In [17]:
# Logistic Regression

# Logistic regression with default hyperparameters, fitted on the vectorized
# training split.
logit = LogisticRegression()
logit.fit(x_train_v, y_train)

In [18]:
# Validation-split predictions from the logistic-regression model.
y_pred = logit.predict(x_val_v)

In [19]:
# Fraction of validation sentences whose predicted label equals the true label.
score = accuracy_score(y_true=y_val, y_pred=y_pred)
score

0.8836206896551724

In [20]:
# Support Vector Machine

# Support-vector classifier with default hyperparameters, fitted on the
# vectorized training split.
svc = SVC()
svc.fit(x_train_v, y_train)

In [21]:
# Validation-split predictions from the SVC (reuses the name `y_pred`).
y_pred = svc.predict(x_val_v)

In [22]:
# Fraction of validation sentences whose predicted label equals the true label.
score = accuracy_score(y_true=y_val, y_pred=y_pred)
score

0.8836206896551724

In [23]:
# DecisionTree

# Seed the tree so that a fresh Restart & Run All rebuilds the same model.
# scikit-learn randomly permutes the candidate features at each split, so an
# unseeded tree can differ between runs when several splits are equally good.
# 51 matches the seed already used for the train/validation/test split.
tree = DecisionTreeClassifier(random_state=51)
tree.fit(x_train_v, y_train)

In [24]:
# Validation-split predictions from the decision tree (reuses the name `y_pred`).
y_pred = tree.predict(x_val_v)

In [25]:
# Fraction of validation sentences whose predicted label equals the true label.
score = accuracy_score(y_true=y_val, y_pred=y_pred)
score

0.8836206896551724

In [26]:
# Logistic Regression, SVC and the Decision Tree Classifier all report the same validation accuracy (about 88.4%, not 100%), so any of them can be chosen.
# I am going to choose the Decision Tree.

### Test Prediction

In [27]:
# Predictions of the chosen decision tree on the held-out test split.
test_predict = tree.predict(x_test_v)

In [28]:
# Fraction of test sentences whose predicted label equals the true label.
score = accuracy_score(y_true=y_test, y_pred=test_predict)
score

0.9356223175965666

#### Side by side Checking Actual vs Predicted

In [29]:
# True test labels next to the model's predictions, keeping y_test's row index.
side_by_side = y_test.to_frame().assign(Predicted_voice_passive=test_predict)

In [30]:
side_by_side

Unnamed: 0,voice_Passive,Predicted_voice_passive
104,0,0
409,1,1
1016,0,0
956,0,0
925,1,1
...,...,...
231,1,1
740,0,0
966,0,0
1034,0,0


# Saving files for deployment

In [31]:
# Persist the fitted vectorizer. The context manager closes the file handle
# as soon as the dump finishes instead of leaving it open for the kernel's
# lifetime.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(Vectorizer, vectorizer_file)

In [32]:
# Persist the trained decision tree. The context manager closes the file
# handle as soon as the dump finishes instead of leaving it open for the
# kernel's lifetime.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(tree, model_file)


### Custom Testing

In [33]:
# Reload the two pickled artifacts, closing each file handle after reading.
# NOTE: pickle.load can execute arbitrary code; only load pickle files that
# come from a trusted source.
with open('vectorizer.pkl', 'rb') as vectorizer_file:
    vect = pickle.load(vectorizer_file)
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [34]:
# Hand-written sentences used to spot-check the reloaded vectorizer and model.
text1 = 'By the time the final decision had been reached by the committee after hours of deliberation, multiple proposals had already been dismissed as impractical by the panel of experts.'
text2 = 'Having been extensively reviewed and debated by the advisory board for weeks, the revised strategic plan, which was originally drafted by the senior analysts, had ultimately been approved by the executive committee after numerous amendments were incorporated.'
text3 = 'The author wrote the book.'

In [35]:
# Collect the sample sentences so they can be classified in one loop.
text_list = [text1, text2, text3]

In [36]:
def testing(text):
    """Classify a single sentence with the reloaded vectorizer and model.

    Parameters
    ----------
    text : str
        Raw sentence to classify.

    Returns
    -------
    str
        The input sentence followed by ``'  :  Passive'`` when the model
        predicts class 1, otherwise by ``'  :  Active'``.
    """
    # transform() expects an iterable of documents, so wrap the one sentence.
    text_v = vect.transform([text])
    # predict() returns an array with one entry per document; take the scalar
    # explicitly instead of relying on the truthiness of an array comparison.
    result = model.predict(text_v)[0]

    label = 'Passive' if result == 1 else 'Active'
    return f'{text}  :  {label}'

In [37]:
# Print every sample sentence together with the label returned by testing().
for text in text_list:
    print(testing(text))

By the time the final decision had been reached by the committee after hours of deliberation, multiple proposals had already been dismissed as impractical by the panel of experts.  :  Passive
Having been extensively reviewed and debated by the advisory board for weeks, the revised strategic plan, which was originally drafted by the senior analysts, had ultimately been approved by the executive committee after numerous amendments were incorporated.  :  Passive
The author wrote the book.  :  Passive


In [38]:
# Only two of the three predictions are correct: the active sentence 'The author wrote the book.' is misclassified as Passive.