## Text Classification for Active vs. Passive Voice Detection

In [148]:
# Packages

import pickle
import re

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


In [149]:
# Data

df = pd.read_csv('active_passive dataset.csv')

In [150]:
# Work on an independent copy: a plain `data = df` would only alias the frame,
# so the in-place cleaning of `data['sentence']` later on would also rewrite
# the raw `df` and make this cell's preview stale on re-run.
data = df.copy()
df.head()


Unnamed: 0,id,sentence,voice
0,1,The chef prepares the meal.,Active
1,2,The teacher explains the lesson clearly.,Active
2,3,The gardener waters the plants every morning.,Active
3,4,The kids play soccer in the park.,Active
4,5,The author wrote a thrilling novel.,Active


### Pre-Processing

In [151]:
df.isnull().sum()   

id          0
sentence    0
voice       0
dtype: int64

In [152]:
# Remove punctuation and convert to lowercase

def clean(text):
    """Normalise a sentence before it is vectorized.

    Every character that is neither a word character (letter, digit or
    underscore) nor whitespace is removed, and the result is lowercased.

    Parameters
    ----------
    text : str
        Raw sentence.

    Returns
    -------
    str
        The sentence without punctuation, in lowercase.
    """
    stripped = re.compile(r'[^\w\s]').sub('', text)
    return stripped.lower()

In [153]:
data['sentence'] = data['sentence'].apply(clean)

In [154]:
data['sentence'].head()

0                      the chef prepares the meal
1         the teacher explains the lesson clearly
2    the gardener waters the plants every morning
3                the kids play soccer in the park
4              the author wrote a thrilling novel
Name: sentence, dtype: object

In [155]:
# Create Dummy Variable

data = pd.get_dummies(data, columns=['voice'], drop_first=True)

In [156]:
data.head(1)

Unnamed: 0,id,sentence,voice_Passive
0,1,the chef prepares the meal,False


In [157]:
data['voice_Passive'] = data['voice_Passive'].astype(int)

In [158]:
data.head(1)

Unnamed: 0,id,sentence,voice_Passive
0,1,the chef prepares the meal,0


In [159]:
data.tail(1)

Unnamed: 0,id,sentence,voice_Passive
39,40,the assignment is submitted on time by the stu...,1


### Train, Validate and Test Splitting

In [160]:
# Declare x and y

x = data['sentence']
y = data['voice_Passive']

In [161]:
# Split Data

# First cut: keep 60% of the rows for training; the remaining 40% is held out.
x_train, x_tmp, y_train, y_tmp = train_test_split(
    x,
    y,
    train_size=0.6,
    random_state=51,
)

# Second cut: halve the held-out rows into a validation set and a test set
# (about 20% of the full data each). The same seed keeps the split repeatable.
x_val, x_test, y_val, y_test = train_test_split(
    x_tmp,
    y_tmp,
    test_size=0.5,
    random_state=51,
)

### Vectorization

In [162]:
Vectorizer = CountVectorizer()

In [163]:
# Learn the vocabulary from the training split only, then reuse that same
# fitted vocabulary to encode the validation and test splits, so the
# held-out sentences never influence the feature space.
x_train_v = Vectorizer.fit_transform(x_train)
x_val_v = Vectorizer.transform(x_val)
x_test_v = Vectorizer.transform(x_test)

### Classifier Models

In [164]:
# Logistic Regression

logit = LogisticRegression()
logit.fit(x_train_v, y_train)

In [165]:
y_pred = logit.predict(x_val_v)

In [166]:
score = accuracy_score(y_val, y_pred)
score

1.0

In [167]:
# Support Vector Machine

svc = SVC()
svc.fit(x_train_v, y_train)

In [168]:
y_pred = svc.predict(x_val_v)

In [169]:
score = accuracy_score(y_val, y_pred)
score

1.0

In [170]:
# DecisionTree

# Fix the seed: scikit-learn randomly permutes the candidate features at each
# split, so equally good splits can be resolved differently from run to run
# unless random_state is set. 51 matches the seed used for the data splits.
tree = DecisionTreeClassifier(random_state=51)
tree.fit(x_train_v, y_train)

In [171]:
y_pred = tree.predict(x_val_v)

In [172]:
score = accuracy_score(y_val, y_pred)
score

1.0

In [173]:
# All three models above (Logistic Regression, SVC and Decision Tree Classifier) reach 100% accuracy on the validation set, so any of them could be chosen.
# I am going to use the Decision Tree.

### Test Prediction

In [174]:
test_predict = tree.predict(x_test_v)

In [175]:
score = accuracy_score(y_test, test_predict)
score

1.0

#### Side by side Checking Actual vs Predicted

In [176]:
# Put each true test label next to the tree's prediction for the same row;
# the frame keeps the original row index of the test sentences.
side_by_side = y_test.to_frame().assign(Predicted_voice_passive=test_predict)

In [177]:
side_by_side

Unnamed: 0,voice_Passive,Predicted_voice_passive
11,0,0
2,0,0
14,0,0
18,0,0
34,1,1
15,0,0
0,0,0
23,1,1


### Custom Testing

In [178]:
# Load the persisted vectorizer and classifier used for the ad-hoc checks below.
# NOTE(review): no cell shown in this notebook writes 'vectorizer.pkl' or
# 'model.pkl' -- they must already exist in the working directory; confirm
# they were produced from the fitted `Vectorizer` and the chosen model.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# only load artifacts that you created yourself.
# The `with` blocks make sure both file handles are closed after loading.
with open('vectorizer.pkl', 'rb') as vectorizer_file:
    vect = pickle.load(vectorizer_file)
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [179]:
text1 = 'I programmed this app'
text2 = 'This app is programmed by me'

In [180]:
text_list = [text1, text2]

In [181]:
def testing(text):
    """Classify one sentence as active or passive voice.

    The sentence is passed through ``clean`` (the same preprocessing applied
    to the training sentences), encoded with the loaded ``vect`` and
    classified by the loaded ``model``.

    Parameters
    ----------
    text : str
        The sentence to classify.

    Returns
    -------
    str
        The original sentence followed by ``'  :  Passive'`` when the model
        predicts 1, otherwise ``'  :  Active'``.
    """
    # Apply the training-time cleaning so inference sees text in the same form.
    text_v = vect.transform([clean(text)])

    # predict returns one label per input row; exactly one row was passed in,
    # so take that single label instead of comparing the whole array.
    label = model.predict(text_v)[0]

    voice = 'Passive' if label == 1 else 'Active'
    return f'{text}  :  {voice}'

In [182]:
for text in text_list:
    print(testing(text))

I programmed this app  :  Active
This app is programmed by me  :  Passive


In [183]:
# Both sentences were classified correctly.