In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:
df = pd.read_csv("dreaddit-train.csv")

In [5]:
df=df[['text','label']]

In [38]:
df

Unnamed: 0,text,label
0,said felt way sugget go rest trigger ahead you...,1
1,hey rassist sure right place post goe im curre...,0
2,mom hit newspap shock would know dont like pla...,1
3,met new boyfriend amaz kind sweet good student...,1
4,octob domest violenc awar month domest violenc...,1
...,...,...
2833,week ago precious ignor jan happi year preciou...,0
2834,dont abil cope anymor im tri lot thing trigger...,1
2835,case first time your read post look peopl will...,0
2836,find normal good relationship main problem see...,0


In [6]:
import nltk
import string
import re
from nltk.corpus import stopwords

stop_words_list = stopwords.words("english")

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"\n", " ", text)
    text = re.sub(r"\d", " ", text)
    text = re.sub(r"<.*?>+", "", text)
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"https?://S+|www.\.\S+", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    words = text.split()
    words = [word for word in words if not word in stop_words_list]
    words = [re.sub(r"(.)\1{1,}", r"\1\1", word) for word in words]
    words = [word.strip() for word in words if len(word.strip()) > 1]
    text = " ".join(words)
    return text

In [7]:
df['text']=df['text'].apply(preprocess_text)

In [8]:
df["text"]

0       said felt way suggeted go rest trigger ahead y...
1       hey rassistance sure right place post goes im ...
2       mom hit newspaper shocked would knows dont lik...
3       met new boyfriend amazing kind sweet good stud...
4       october domestic violence awareness month dome...
                              ...                        
2833    week ago precious ignored jan happy year preci...
2834    dont ability cope anymore im trying lot things...
2835    case first time youre reading post looking peo...
2836    find normal good relationship main problem see...
2837    talking mom morning said sister trauma worse m...
Name: text, Length: 2838, dtype: object

In [9]:
from nltk import SnowballStemmer

stemmer = SnowballStemmer("english")

def stemming(text):
    stemmed_text = ""
    for word in text.split():
        stem = stemmer.stem(word)
        stemmed_text += stem
        stemmed_text += " "
        
    stemmed_text = stemmed_text.strip()
    return stemmed_text

In [10]:
df['text']=df['text'].apply(stemming)

In [11]:
df['text']

0       said felt way sugget go rest trigger ahead you...
1       hey rassist sure right place post goe im curre...
2       mom hit newspap shock would know dont like pla...
3       met new boyfriend amaz kind sweet good student...
4       octob domest violenc awar month domest violenc...
                              ...                        
2833    week ago precious ignor jan happi year preciou...
2834    dont abil cope anymor im tri lot thing trigger...
2835    case first time your read post look peopl will...
2836    find normal good relationship main problem see...
2837    talk mom morn said sister trauma wors mine did...
Name: text, Length: 2838, dtype: object

In [12]:
X = df["text"]
y = df["label"]

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X_scaled = tfidf.fit_transform(X)

In [26]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3)

In [27]:
from sklearn.linear_model import LogisticRegression

In [28]:
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)

In [29]:
predictions = logmodel.predict(X_test)

In [30]:
from sklearn.metrics import classification_report

In [31]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.69      0.76      0.72       392
           1       0.78      0.71      0.74       460

    accuracy                           0.73       852
   macro avg       0.73      0.74      0.73       852
weighted avg       0.74      0.73      0.73       852



In [42]:
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")

In [43]:
# Define hyperparameters to tune
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}


# Grid search cross-validation to find the best hyperparameters
grid_search = GridSearchCV(logmodel, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train,y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Evaluate the model with best hyperparameters on the test set
best_model = grid_search.best_estimator_
accuracy = best_model.score(X_test, y_test)
print("Accuracy on test set:", accuracy)

Best Hyperparameters: {'C': 1, 'penalty': 'l2'}
Accuracy on test set: 0.7335680751173709


In [44]:
import pickle





with open('logmodel.pkl', 'wb') as f:
    pickle.dump(logmodel, f)

# Save CountVectorizer to a pickle file
with open('tfidf.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

In [57]:
new_review="met new boyfriend amaz kind sweet good student like thing famili like dont feel passion rush felt ex truth start go boyfriend secret saw ex time see realli didnt feel noth disgust didnt even want touch feel bad didnt want still kinda realiz felt noth love relat ok hurt knew date boy even beg stay cours problem im boyfriend dont feel like love like thing kinda feel new love feel ok catch think ex time time rememb good thing drive crazi know see wont feel way love mind make think still recent found girl actual enjoy experi got mad hurt know dont right feel way felt betray still feel way gross"

# Preprocess the new review
preprocessed_review = preprocess_text(new_review)

# Transform the preprocessed review into TF-IDF features
tfidf_features = tfidf.transform([preprocessed_review])

# Make predictions using the trained logistic regression model
predicted_label = logmodel.predict(tfidf_features)

# Print the predicted label
print("Predicted Label:", predicted_label)


Predicted Label: [1]


In [32]:
from sklearn import svm

In [33]:
model = svm.SVC(kernel='linear', C=1.0)

In [34]:
model.fit(X_train, y_train)

In [35]:
y_pred = model.predict(X_test)

In [37]:
from sklearn.metrics import accuracy_score, classification_report

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7276995305164319
              precision    recall  f1-score   support

           0       0.68      0.76      0.72       392
           1       0.78      0.70      0.73       460

    accuracy                           0.73       852
   macro avg       0.73      0.73      0.73       852
weighted avg       0.73      0.73      0.73       852



In [39]:
# Importing necessary libraries
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Create a k-NN classifier with k=3
knn = KNeighborsClassifier(n_neighbors=5)

# Train the model
knn.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = knn.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.642018779342723
