In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

In [2]:
# Read the labelled messages from a CSV given as a relative path.
# NOTE(review): the recorded output shows an "Unnamed: 0" column, which looks like a
# previously saved index; consider index_col=0 once that is confirmed.
DATA_PATH = 'Flrt & n.flrt msg.csv'
dataset = pd.read_csv(DATA_PATH)  # index_col is None by default -> fresh RangeIndex
dataset

Unnamed: 0,Message,Label
0,I am fortunate to have such a hot person in my...,Flirt Message
1,Just saying... I'm feeling a bit naughty.,Flirt Message
2,Don't dismiss your partner's desire.,Flirt Message
3,"Come on, angel, my hearts are burning...",Flirt Message
4,"I want to have a shower, would you like to joi...",Flirt Message
...,...,...
199,Happy to hear.,Non Flirt Message
200,Good for u.,Non Flirt Message
201,Am not feeling well.,Non Flirt Message
202,Who told u.,Non Flirt Message


In [3]:
# Target vector: the "Label" column (one class label per message).
y = dataset.loc[:, "Label"]

In [4]:
# Hold out 33% of the messages for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): no `stratify=` is passed, so class proportions in train and test
# are left to chance.
X_train, X_test, y_train, y_test = train_test_split(
    dataset['Message'],
    y,
    test_size=0.33,
    random_state=53,
)

In [5]:
# Two text featurizers built from one shared configuration: raw term counts and
# TF-IDF weights, both using the built-in English stop-word list.
vectorizer_options = {'stop_words': 'english'}
count_vectorizer = CountVectorizer(**vectorizer_options)
tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)

In [6]:
# Each vectorizer learns its vocabulary from the training messages only
# (fit_transform) and then encodes the held-out messages with that vocabulary
# (transform). Tuple elements are evaluated left to right, so the fit always
# happens before the test set is transformed.

# Term counts
count_train, count_test = (
    count_vectorizer.fit_transform(X_train),
    count_vectorizer.transform(X_test),
)

# TF-IDF weights
tfidf_train, tfidf_test = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)

In [7]:
# Candidate classifiers, keyed by the display name used in the evaluation printouts.
# Estimators that draw random numbers while fitting (tree feature permutation,
# forest bootstrapping, passive-aggressive sample shuffling) get a fixed seed so a
# fresh Restart & Run All reproduces the same scores.
MODEL_SEED = 53  # same value as the train/test split seed
models = {
    'Logistic Regression': LogisticRegression(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Support Vector Machine': SVC(),
    'Decision Tree': DecisionTreeClassifier(random_state=MODEL_SEED),
    'Multinomial Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(random_state=MODEL_SEED),
    'Passive Aggressive Classifier': PassiveAggressiveClassifier(random_state=MODEL_SEED),
}

In [8]:
# Training and evaluating each model using CountVectorizer
# Every estimator in `models` is (re)fitted on the term-count features and scored
# on the held-out split: accuracy, confusion matrix and per-class report.
class_labels = ['Flirt Message', 'Non Flirt Message']  # row/column order of the confusion matrix
for model_name, model in models.items():
    print(f"\n{model_name} - CountVectorizer")
    model.fit(count_train, y_train)
    pred = model.predict(count_test)
    score = metrics.accuracy_score(y_test, pred)
    print("Accuracy:", score)
    cm = metrics.confusion_matrix(y_test, pred, labels=class_labels)
    print("Confusion Matrix:\n", cm)
    # zero_division=0 reports 0.0 (same number as the default) without raising a
    # warning when a model never predicts one of the classes.
    report = classification_report(y_test, pred, zero_division=0)
    print("Classification Report:\n", report)


Logistic Regression - CountVectorizer
Accuracy: 0.7941176470588235
Confusion Matrix:
 [[17  7]
 [ 7 37]]
Classification Report:
                    precision    recall  f1-score   support

    Flirt Message       0.71      0.71      0.71        24
Non Flirt Message       0.84      0.84      0.84        44

         accuracy                           0.79        68
        macro avg       0.77      0.77      0.77        68
     weighted avg       0.79      0.79      0.79        68


K-Nearest Neighbors - CountVectorizer
Accuracy: 0.6470588235294118
Confusion Matrix:
 [[ 0 24]
 [ 0 44]]
Classification Report:
                    precision    recall  f1-score   support

    Flirt Message       0.00      0.00      0.00        24
Non Flirt Message       0.65      1.00      0.79        44

         accuracy                           0.65        68
        macro avg       0.32      0.50      0.39        68
     weighted avg       0.42      0.65      0.51        68


Support Vector Machine - 

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is

In [9]:
# Training and evaluating each model using TfidfVectorizer
# Every estimator in `models` is (re)fitted on the TF-IDF features and scored
# on the held-out split: accuracy, confusion matrix and per-class report.
class_labels = ['Flirt Message', 'Non Flirt Message']  # row/column order of the confusion matrix
for model_name, model in models.items():
    print(f"\n{model_name} - TfidfVectorizer")
    model.fit(tfidf_train, y_train)
    pred = model.predict(tfidf_test)
    score = metrics.accuracy_score(y_test, pred)
    print("Accuracy:", score)
    cm = metrics.confusion_matrix(y_test, pred, labels=class_labels)
    print("Confusion Matrix:\n", cm)
    # zero_division=0 reports 0.0 (same number as the default) without raising a
    # warning when a model never predicts one of the classes.
    report = classification_report(y_test, pred, zero_division=0)
    print("Classification Report:\n", report)


Logistic Regression - TfidfVectorizer
Accuracy: 0.5441176470588235
Confusion Matrix:
 [[21  3]
 [28 16]]
Classification Report:
                    precision    recall  f1-score   support

    Flirt Message       0.43      0.88      0.58        24
Non Flirt Message       0.84      0.36      0.51        44

         accuracy                           0.54        68
        macro avg       0.64      0.62      0.54        68
     weighted avg       0.70      0.54      0.53        68


K-Nearest Neighbors - TfidfVectorizer
Accuracy: 0.7794117647058824
Confusion Matrix:
 [[12 12]
 [ 3 41]]
Classification Report:
                    precision    recall  f1-score   support

    Flirt Message       0.80      0.50      0.62        24
Non Flirt Message       0.77      0.93      0.85        44

         accuracy                           0.78        68
        macro avg       0.79      0.72      0.73        68
     weighted avg       0.78      0.78      0.76        68


Support Vector Machine - 

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is

# Best Model - Based on F1 Score & Confusion Matrix

Decision Tree - CountVectorizer
Accuracy: 0.8088235294117647
Confusion Matrix:
 [[14 10]
 [ 3 41]]
Classification Report:
                    precision    recall  f1-score   support

    Flirt Message       0.82      0.58      0.68        24
Non Flirt Message       0.80      0.93      0.86        44

         accuracy                           0.81        68
        macro avg       0.81      0.76      0.77        68
     weighted avg       0.81      0.81      0.80        68

 
       &
       
       
Passive Aggressive Classifier - CountVectorizer
Accuracy: 0.8088235294117647
Confusion Matrix:
 [[15  9]
 [ 4 40]]
Classification Report:
                    precision    recall  f1-score   support

    Flirt Message       0.79      0.62      0.70        24
Non Flirt Message       0.82      0.91      0.86        44

         accuracy                           0.81        68
        macro avg       0.80      0.77      0.78        68
     weighted avg       0.81      0.81      0.80        68
     
Both models seem to perform well.

However, I am choosing Decision Tree - CountVectorizer as the best model.

# Saving Best Model

In [10]:
# Assuming the best model is Decision Tree - CountVectorizer
# A fresh tree is fitted on the term-count features of the training split.
# The fixed random_state makes the fitted (and later pickled) tree identical on
# every run; an unseeded tree can differ between runs.
best_model = DecisionTreeClassifier(random_state=53)
best_model.fit(count_train, y_train)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [11]:
import pickle

# Persist the fitted classifier to disk.
# NOTE(review): the file name says "Random_Forest" although the object stored is
# the Decision Tree held in `best_model`; the name is kept unchanged because the
# loading cell opens this exact path.
# NOTE(review): the fitted CountVectorizer is not saved, so the pickled model alone
# cannot turn new text into features.
filename="Finalized_Model_Random_Forest.sav"
with open(filename, 'wb') as model_file:  # context manager closes the handle after writing
    pickle.dump(best_model, model_file)

In [12]:
# Read the pickled classifier back from disk.
# pickle.load can execute arbitrary code, so only load files you created yourself.
with open("Finalized_Model_Random_Forest.sav", 'rb') as model_file:  # handle is closed on exit
    loaded_model = pickle.load(model_file)

In [13]:
# Text of the message stored under index label 3.
dataset.loc[3, "Message"]

'Come on, angel, my hearts are burning...'

In [14]:
# Term-count vector for dataset message 3, encoded with the fitted vocabulary.
# Indexing count_train[[3]] would instead return the 4th *training* row: the split
# shuffles the messages, so row positions in count_train do not match dataset labels.
count_vectorizer.transform([dataset["Message"][3]])

<1x305 sparse matrix of type '<class 'numpy.int64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [15]:
# Rebuild the term-count vectorizer from scratch on the training messages and
# re-encode them; same configuration and same input as the earlier vectorizing cells.
count_vectorizer = CountVectorizer(stop_words='english').fit(X_train)
count_train = count_vectorizer.transform(X_train)

In [16]:
# Show the dataset index labels of the training messages in the order they were
# vectorized: row i of count_train corresponds to X_train.index[i], not to dataset row i.
print(X_train.index)

Index([ 41, 131,   0,  60, 115,  37,  95, 190,  15,  49,
       ...
        88,  48, 127,  63, 150, 189, 165, 117,  29,  25],
      dtype='int64', length=136)


In [17]:
# Classify dataset message 3 itself: vectorize its text with the fitted vocabulary.
# (count_train[[3]] is the 4th row of the shuffled training split, i.e. the message
# whose label is X_train.index[3], not message 3.)
message_features = count_vectorizer.transform([dataset["Message"][3]])
prediction = best_model.predict(message_features)
print("Predicted Class:", prediction)

Predicted Class: ['Flirt Message']
