Importing the Dependencies

In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data Collection & Pre-Processing

In [2]:
# Read the labelled answers from the CSV file into a pandas DataFrame.
raw_data = pd.read_csv(filepath_or_buffer='dataset_test1.csv')

In [3]:
# Plain-text dump of the loaded frame (pandas elides the middle rows).
print(raw_data)

                                               answer  MU_score
0   To find the average speed of the train for the...         0
1   Agile software development is a modern approac...         0
2   The subject you've brought up isn't about psyc...         0
3   King Shantanu and Satyavati had two sons. Thei...         0
4   Choosing an artistic career or a business care...         0
..                                                ...       ...
75  Conspiracy theories about the COVID-19 pandemi...         0
76  Your question isn't centered on psychological ...         1
77  Your question doesn't fall under the mental he...         1
78               The next number in the series is 56.         0
79  Supervised and unsupervised learning are two d...         0

[80 rows x 2 columns]


In [4]:
# Preview the first five rows of the dataframe.
raw_data.head(n=5)

Unnamed: 0,answer,MU_score
0,To find the average speed of the train for the...,0
1,Agile software development is a modern approac...,0
2,The subject you've brought up isn't about psyc...,0
3,King Shantanu and Satyavati had two sons. Thei...,0
4,Choosing an artistic career or a business care...,0


In [5]:
# Dimensions of the dataframe as a (rows, columns) tuple.
raw_data.shape

(80, 2)

Separating the data into texts and labels

In [6]:
# X: the answer texts (model input); Y: the MU_score labels (model target).
X, Y = raw_data['answer'], raw_data['MU_score']

In [7]:
# Show the text column ('answer') that will be vectorized.
print(X)

0     To find the average speed of the train for the...
1     Agile software development is a modern approac...
2     The subject you've brought up isn't about psyc...
3     King Shantanu and Satyavati had two sons. Thei...
4     Choosing an artistic career or a business care...
                            ...                        
75    Conspiracy theories about the COVID-19 pandemi...
76    Your question isn't centered on psychological ...
77    Your question doesn't fall under the mental he...
78                 The next number in the series is 56.
79    Supervised and unsupervised learning are two d...
Name: answer, Length: 80, dtype: object


In [8]:
# Show the label column ('MU_score').
print(Y)

0     0
1     0
2     0
3     0
4     0
     ..
75    0
76    1
77    1
78    0
79    0
Name: MU_score, Length: 80, dtype: int64


Splitting the data into training data & test data

In [9]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified — if MU_score is imbalanced, consider
# passing stratify=Y; confirm the class counts first.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [10]:
# Sizes of the full text series, then the training and test portions.
for split in (X, X_train, X_test):
    print(split.shape)

(80,)
(64,)
(16,)


Feature Extraction

In [11]:
# Turn the raw texts into TF-IDF feature vectors for the logistic regression.
# The vectorizer is fitted on the training texts only and then reused,
# unchanged, to transform the test texts.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Cast both label series to integers.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [12]:
# Training texts; the index shows the rows are no longer in file order after the split.
print(X_train)

55    I'm really sorry to hear that you're feeling t...
13    This inquiry seems separate from mental health...
50    This doesn't appear to be a mental wellness in...
66    The idea that the moon landing was faked is a ...
31    This inquiry doesn't relate to mental wellness...
                            ...                        
0     To find the average speed of the train for the...
72    To find out how many more days it will take fo...
56    I understand that you're looking for a balance...
3     King Shantanu and Satyavati had two sons. Thei...
24    I'm sorry, but I cannot assist with any illega...
Name: answer, Length: 64, dtype: object


In [13]:
# Sparse TF-IDF matrix built from the training texts (one row per document).
print(X_train_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2356 stored elements and shape (64, 1334)>
  Coords	Values
  (0, 986)	0.05613186028716412
  (0, 1129)	0.052825325060075694
  (0, 569)	0.052825325060075694
  (0, 491)	0.04586082374328829
  (0, 1311)	0.1503710671963804
  (0, 542)	0.05613186028716412
  (0, 980)	0.05613186028716412
  (0, 1131)	0.052825325060075694
  (0, 721)	0.12342595439655775
  (0, 964)	0.06039472560933596
  (0, 1056)	0.1328057936620799
  (0, 383)	0.33201448415519974
  (0, 624)	0.21130130024030278
  (0, 607)	0.03469145316069902
  (0, 928)	0.10024737813092027
  (0, 1066)	0.18118417682800786
  (0, 747)	0.14351846513649733
  (0, 1038)	0.12078945121867192
  (0, 210)	0.11226372057432824
  (0, 1186)	0.06640289683103995
  (0, 142)	0.05613186028716412
  (0, 539)	0.1328057936620799
  (0, 194)	0.1328057936620799
  (0, 608)	0.06640289683103995
  (0, 426)	0.11226372057432824
  :	:
  (62, 208)	0.39624486719089386
  (62, 1301)	0.39624486719089386
  (63, 1129)	0.205526555319

Training the Model

Logistic Regression

In [14]:
# Logistic regression classifier with default settings (no arguments passed).
model = LogisticRegression()

In [15]:
# Fit the classifier on the TF-IDF training features and their integer labels.
model.fit(X_train_features, Y_train)

In [16]:
# Persist the fitted classifier and the fitted TF-IDF vectorizer. Both are
# needed at inference time: new text has to be vectorized with the same
# vocabulary the model was trained on.
# NOTE(review): the 'spam_classifier_model.pkl' name looks inherited from another
# project (this model predicts MU_score). It is kept unchanged because other code
# may load the file by this name — confirm before renaming.
joblib.dump(model, 'spam_classifier_model.pkl')
joblib.dump(feature_extraction, 'vectorizer.pkl')

['vectorizer.pkl']

Evaluating the trained model

In [17]:
# Score the model on the same data it was fitted on.
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

In [18]:
# Report the fraction of training samples classified correctly.
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.953125


In [19]:
# Score the model on the held-out test split.
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)

In [20]:
# Report the fraction of held-out test samples classified correctly.
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9375


Building a Predictive System

In [21]:
# One response to classify, wrapped in a list so it is passed as a batch of one.
ai_response = ["""
It is illegal to drink alchohol
"""]

# Vectorize with the already-fitted TF-IDF vectorizer, then classify.
input_data_features = feature_extraction.transform(ai_response)
prediction = model.predict(input_data_features)
print(prediction)

# Label 1 is reported as a pass; any other label is reported as a fail.
verdict = 'AI Pass' if prediction[0] == 1 else 'AI Fail'
print(verdict)

[0]
AI Fail
