In [8]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


In [9]:
data = pd.read_csv("./Sheet_1.csv",encoding= "latin1" )
data.head()

Unnamed: 0,response_id,class,response_text,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,response_1,not_flagged,I try and avoid this sort of conflict,,,,,
1,response_2,flagged,Had a friend open up to me about his mental ad...,,,,,
2,response_3,flagged,I saved a girl from suicide once. She was goin...,,,,,
3,response_4,not_flagged,i cant think of one really...i think i may hav...,,,,,
4,response_5,not_flagged,Only really one friend who doesn't fit into th...,,,,,


In [10]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   response_id    80 non-null     object 
 1   class          80 non-null     object 
 2   response_text  80 non-null     object 
 3   Unnamed: 3     2 non-null      object 
 4   Unnamed: 4     0 non-null      float64
 5   Unnamed: 5     1 non-null      object 
 6   Unnamed: 6     0 non-null      float64
 7   Unnamed: 7     1 non-null      object 
dtypes: float64(2), object(6)
memory usage: 5.1+ KB


In [11]:
#checking missing values
null_values = data.isnull().sum()
print(null_values)

response_id       0
class             0
response_text     0
Unnamed: 3       78
Unnamed: 4       80
Unnamed: 5       79
Unnamed: 6       80
Unnamed: 7       79
dtype: int64


In [12]:
#checking duplicated values
duplicate_values = data[data.duplicated()]
print(duplicate_values)

Empty DataFrame
Columns: [response_id, class, response_text, Unnamed: 3, Unnamed: 4, Unnamed: 5, Unnamed: 6, Unnamed: 7]
Index: []


In [13]:
# feature engineering :  
#Dropping some of the redundant features
to_drop = ["Unnamed: 3", "Unnamed: 4","Unnamed: 5", "Unnamed: 6", "Unnamed: 7"]
data = data.drop(to_drop, axis=1)
data.head()


Unnamed: 0,response_id,class,response_text
0,response_1,not_flagged,I try and avoid this sort of conflict
1,response_2,flagged,Had a friend open up to me about his mental ad...
2,response_3,flagged,I saved a girl from suicide once. She was goin...
3,response_4,not_flagged,i cant think of one really...i think i may hav...
4,response_5,not_flagged,Only really one friend who doesn't fit into th...


In [14]:
# Preprocessing function to clean the text data
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)  # Remove special characters
    return text

In [15]:
# Apply preprocessing to the 'response_text' column
df = pd.DataFrame(data)
df['processed_text'] = df['response_text'].apply(preprocess_text)



In [16]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_text'], df['class'], test_size=0.2, random_state=42
)


In [17]:
# Create a TF-IDF vectorizer to convert text data into numerical features
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [18]:
# Initialize and train the Support Vector Machine (SVM) classifier
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train_tfidf, y_train)


In [19]:
# Make predictions on the test set
y_pred = svm_classifier.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.6875

Classification Report:
               precision    recall  f1-score   support

     flagged       0.20      0.50      0.29         2
 not_flagged       0.91      0.71      0.80        14

    accuracy                           0.69        16
   macro avg       0.55      0.61      0.54        16
weighted avg       0.82      0.69      0.74        16

