In [1]:
import pandas as pd
import numpy as np


In [2]:
dataset=pd.read_csv("Flirt_Prediction_Dataset.csv",header=None,on_bad_lines='skip',encoding="utf8")
   


In [3]:
dataset.columns=["messages","flirt_class"]

In [4]:
dataset["flirt_class"].unique

<bound method Series.unique of 0     label
1         0
2         1
3         1
4         0
      ...  
95        0
96        1
97        1
98        1
99        1
Name: flirt_class, Length: 100, dtype: object>

In [5]:
dataset

Unnamed: 0,messages,flirt_class
0,message,label
1,Are we still on for tomorrow?,0
2,You're more fun than my favorite song.,1
3,I had a dream about you last night.,1
4,Need to catch up on some sleep.,0
...,...,...
95,Please let me know when you’re free.,0
96,You always know just what to say.,1
97,You make my heart skip a beat.,1
98,You’re exactly my type.,1


In [6]:
dataset=dataset.drop(0,axis=0)

In [7]:
dataset["flirt_class"].dtype

dtype('O')

In [8]:
dataset["flirt_class"]=dataset["flirt_class"].astype("int64")

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(stop_words="english",max_df=.95,min_df=2)

In [10]:
messg_mat=cv.fit_transform(dataset["messages"])

In [11]:
messg_mat.shape

(99, 39)

In [12]:

independent=messg_mat
dependent=dataset["flirt_class"]


In [13]:
from sklearn.model_selection import train_test_split as ts
x_train,x_test,y_train,y_test=ts(independent,dependent,test_size=.2,random_state=42)

In [14]:
y_train

50    1
71    0
69    1
16    0
40    0
     ..
61    0
72    0
15    1
93    0
52    0
Name: flirt_class, Length: 79, dtype: int64

# now lets create models using different classification algorithms

In [15]:
# different classification algorithms are logistic regression,naive bayes,knn,svm,decision tree,randomforest,XGBoost / Gradient Boost

In [16]:
#!pip install xgboost

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report,accuracy_score

In [18]:
lg = LogisticRegression(C=0.1, solver='liblinear', max_iter=500, random_state=42)
nb = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
svc = SVC(C=0.1, kernel='linear', gamma='scale', probability=True, random_state=42)
rf = RandomForestClassifier(n_estimators=200, max_depth=20, min_samples_split=5, random_state=42)
dt = DecisionTreeClassifier(
    criterion='entropy',          # 'entropy' might work better for some text tasks (information gain)
    max_depth=5,                  # Decreasing depth to reduce overfitting
    min_samples_split=4,          # Minimum samples required to split an internal node
    min_samples_leaf=2,           # Minimum samples required at the leaf node
    max_features='auto',          # Consider all features for best splitting
    random_state=42 )
knn = KNeighborsClassifier(n_neighbors=5, weights='distance', algorithm='auto', leaf_size=30)
xgb = XGBClassifier(n_estimators=200, max_depth=6, learning_rate=0.05, random_state=42, use_label_encoder=False, eval_metric='logloss')


In [19]:

acc=[]
algorithm=[lg,nb,svc,rf,dt,knn,xgb]
def flirtidentif(model,x_train,x_test,y_test):
    for model in algorithm:
        model.fit(x_train,y_train)
        y_pred=model.predict(x_test)
        ac_score=accuracy_score(y_pred,y_test)
        acc.append(ac_score)
    
    return model,acc
    
    

In [20]:
model,acc=flirtidentif(lg,x_train,x_test,y_test)
acc

[0.45, 0.55, 0.35, 0.55, 0.4, 0.5, 0.7]

In [21]:
dataset["flirt_class"].dtype

dtype('int64')

# lets tabularise these results 

In [22]:
columns=["accuracy score"]
ind_list=["LogisticRegression","MultinomialNB","KNeighborsClassifier","SVC","DecisionTreeClassifier","RandomForestClassifier","XGBClassifier"]
acc_table=pd.DataFrame(index=["LogisticRegression","MultinomialNB","KNeighborsClassifier","SVC","DecisionTreeClassifier","RandomForestClassifier","XGBClassifier"],columns=["accuracy score"])

acc_table




Unnamed: 0,accuracy score
LogisticRegression,
MultinomialNB,
KNeighborsClassifier,
SVC,
DecisionTreeClassifier,
RandomForestClassifier,
XGBClassifier,


In [23]:
acc

[0.45, 0.55, 0.35, 0.55, 0.4, 0.5, 0.7]

In [24]:
enumerate(ind_list)

<enumerate at 0x1c03b4d7678>

In [25]:
for index,name in enumerate(ind_list):
    acc_table.loc[name,columns]=acc[index]


In [26]:
acc_table

Unnamed: 0,accuracy score
LogisticRegression,0.45
MultinomialNB,0.55
KNeighborsClassifier,0.35
SVC,0.55
DecisionTreeClassifier,0.4
RandomForestClassifier,0.5
XGBClassifier,0.7


In [27]:
import pickle

In [28]:
pickle.dump(cv,open("countvectorizer.sav","wb"))

In [29]:
pickle.dump(rf,open("model_rf.sav","wb"))