In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


In [2]:
# Read the raw CSV and, in the same chain, give the text and class columns the
# names used from here on ('data' and 'Label').
df = (
    pd.read_csv("/content/draft_50000.csv")
    .rename(columns={'text': 'data', 'class': 'Label'})
)
df.head()

Unnamed: 0.1,Unnamed: 0,data,Label
0,51297,get crush guy definitely way old laugh literal...,0.0
1,24705,go to july 2018i hope 8 month enough change mind,1.0
2,185969,want live anymore 23 right thinking end life p...,1.0
3,201675,every time get period want kill already depres...,1.0
4,52701,story incomplete similar story op case dad spe...,1.0


In [3]:
# Lowercase the text and strip every character that is neither a word character
# nor whitespace. The pattern is a raw string so that \w and \s reach the regex
# engine untouched; in a plain literal they are invalid string escapes, which
# Python reports with a DeprecationWarning (a SyntaxWarning from 3.12 on).
df['data'] = df['data'].str.lower().str.replace(r'[^\w\s]', '', regex=True)
# Discard rows that lack either the text or its label.
df.dropna(subset=['data', 'Label'], inplace=True)


In [4]:
import nltk
# word_tokenize needs the Punkt tokenizer models. Recent NLTK releases look them
# up under 'punkt_tab' while older ones use 'punkt', so both are requested to
# keep this cell working on a fresh kernel with either.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
  nltk.download(resource)

stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(['.', ',', '!', '?', ';', ':'])

# Frozen, hash-based copy of the list above. Testing membership against the
# list would scan it once per token; the set lookup does not. It is built once
# here, so later changes to `stopwords` are not reflected unless this line is
# re-run.
STOPWORD_SET = frozenset(stopwords)

def remove_stopwords(text):
  """
  Removes stopwords from a given text.

  The text is split with nltk.word_tokenize; every token whose lowercase form
  is in STOPWORD_SET is dropped, and the remaining tokens are joined back
  together with single spaces.

  Args:
    text: A string containing the text.

  Returns:
    A string with stopwords removed.
  """
  tokens = nltk.word_tokenize(text)
  kept_tokens = [token for token in tokens if token.lower() not in STOPWORD_SET]
  return ' '.join(kept_tokens)

df['data'] = df['data'].apply(remove_stopwords)
df.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0.1,Unnamed: 0,data,Label
0,51297,get crush guy definitely way old laugh literal...,0.0
1,24705,go july 2018i hope 8 month enough change mind,1.0
2,185969,want live anymore 23 right thinking end life p...,1.0
3,201675,every time get period want kill already depres...,1.0
4,52701,story incomplete similar story op case dad spe...,1.0


In [5]:
from sklearn.model_selection import train_test_split
# Features are the text column, targets are the label column.
X = df.data
y = df.Label
# Split data: 20% held out for testing. The arguments match the split performed
# in the modelling cell that follows (test_size=0.2, random_state=42), which
# re-creates these same four variables. Using the same call here means the
# partition is identical regardless of which of the two cells ran last.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [6]:
# Features are the text column; the target is the label column.
X, y = df['data'], df['Label']

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Two-stage pipeline: TF-IDF features (at most 5000 terms) feeding an SVM with
# a linear kernel.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('svm', SVC(kernel='linear', C=1, random_state=42)),
])

# Fit on the training texts, then predict the held-out ones. fit() returns the
# fitted pipeline itself, so the two calls can be chained.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Score the predictions against the true test labels.
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:",accuracy)
print("\nClassification Report:\n", classification_rep)

Accuracy: 0.8942307692307693

Classification Report:
               precision    recall  f1-score   support

         0.0       0.86      0.92      0.89       241
         1.0       0.93      0.87      0.90       279

    accuracy                           0.89       520
   macro avg       0.89      0.90      0.89       520
weighted avg       0.90      0.89      0.89       520

