# Implementation of SVM

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Path to the folder containing CSV files
folder_path = "data/"

# Initialize an empty list to store DataFrames
dfs = []

# Loop through the files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
        file_path = os.path.join(folder_path, filename)
        # Read each CSV file into a DataFrame
        df = pd.read_csv(file_path)
        # Append the DataFrame to the list
        dfs.append(df)

# Concatenate all DataFrames in the list into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

In [3]:
combined_df.drop(columns=['author', 'date', 'post'], inplace=True)

In [4]:
# from the combined_df, create a new dataframe in which the 'subreddit' column has only the values 'addiction', 'adhd', 'alcoholism', 'anxiety', 'autism', 'bpd', 'depression', 'lonely', 'ptsd', 'schizophrenia', 'suicidewatch'. Name this new dataframe as 'new_df'
combined_df = combined_df[combined_df['subreddit'].isin(['addiction', 'adhd', 'alcoholism', 'anxiety', 'autism', 'bpd', 'depression', 'lonely', 'ptsd', 'schizophrenia', 'suicidewatch'])] 

In [5]:
combined_df.head()

Unnamed: 0,subreddit,automated_readability_index,coleman_liau_index,flesch_kincaid_grade_level,flesch_reading_ease,gulpease_index,gunning_fog_index,lix,smog_index,wiener_sachtextformel,...,tfidf_wish,tfidf_without,tfidf_wonder,tfidf_work,tfidf_worri,tfidf_wors,tfidf_would,tfidf_wrong,tfidf_x200b,tfidf_year
31382,adhd,5.400816,6.50584,5.555245,81.416541,68.047619,9.145306,31.706803,9.3871,2.832296,...,0.0,0.0,0.0,0.0,0.0,0.0,0.095341,0.0,0.0,0.086429
31383,adhd,2.980698,5.751419,4.789892,76.862769,79.896552,8.314655,27.68319,9.017664,2.933491,...,0.0,0.0,0.0,0.099106,0.0,0.0,0.290114,0.0,0.0,0.0
31384,adhd,5.136889,6.746474,6.981667,69.0525,69.148148,10.733333,34.240741,10.793553,4.375385,...,0.0,0.0,0.0,0.0,0.117894,0.0,0.0,0.126925,0.0,0.0
31385,adhd,2.841137,5.750767,4.59774,76.904579,82.383459,8.13703,27.109492,8.841846,2.895752,...,0.0,0.0,0.0,0.0,0.0,0.0,0.303759,0.0,0.0,0.0
31386,adhd,5.781923,7.779519,6.42359,72.163077,68.102564,9.302564,37.358974,9.725611,3.969113,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
combined_df.shape

(374499, 347)

In [7]:
class SVM:

    def __init__(self, C = 1.0):
        # C = error term
        self.C = C
        self.w = 0
        self.b = 0

    # Hinge Loss Function / Calculation
    def hingeloss(self, w, b, x, y):
        # Regularizer term
        reg = 0.5 * np.sum(w * w)

        y = y.reshape(-1, 1)

        # Calculate the optimization term for each data point
        opt_term = y * (np.dot(w, x.T) + b)


        # Calculate the loss for each data point
        loss = reg + self.C * np.sum(np.maximum(0, 1 - opt_term))

        return loss


    def fit(self, X, Y, batch_size=100, learning_rate=0.001, epochs=1000):
        # The number of features in X
        number_of_features = X.shape[1]

        # The number of Samples in X
        number_of_samples = X.shape[0]

        c = self.C

        # Creating ids from 0 to number_of_samples - 1
        ids = np.arange(number_of_samples)

        # Shuffling the samples randomly
        np.random.shuffle(ids)

        # creating an array of zeros
        w = np.zeros((1, number_of_features))
        b = 0
        losses = []

        # Gradient Descent 
        for i in range(epochs):
            # Calculating the Hinge Loss
            l = self.hingeloss(w, b, X, Y)

            # Appending all losses 
            losses.append(l)
            
            # Starting from 0 to the number of samples with batch_size as interval
            for batch_initial in range(0, number_of_samples, batch_size):
                gradw = 0
                gradb = 0

                for j in range(batch_initial, batch_initial+ batch_size):
                    if j < number_of_samples:
                        x = ids[j]
                        ti = Y[x] * (np.dot(w, X[x].T) + b)

                        if (ti > 1).all():
                            gradw += 0
                            gradb += 0
                        else:
                            # Calculating the gradients

                            #w.r.t w 
                            gradw += c * Y[x] * X[x]
                            # w.r.t b
                            gradb += c * Y[x]

                # Updating weights and bias
                w = w - learning_rate * w + learning_rate * gradw
                b = b + learning_rate * gradb
        
        self.w = w
        self.b = b

        return self.w, self.b, losses


    def predict(self, X):
        
        prediction = np.dot(X, self.w[0]) + self.b # w.x + b
        return np.sign(prediction)

In [8]:
from sklearn.model_selection import train_test_split

X = combined_df.drop('subreddit', axis=1)  # Features
y = combined_df['subreddit']  # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
print(X_train.shape)
print(y_train.shape)

print(type(X_train))
print(type(y_train))

(20, 346)
(20,)
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [11]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Encode the categorical target variable (y_train) using OneHotEncoder
encoder = OneHotEncoder(sparse=False)
y_train_encoded = encoder.fit_transform(y_train.values.reshape(-1, 1))

# Step 2: Flatten the encoded labels
y_train_encoded = y_train_encoded.argmax(axis=1)

# Step 3: Train the SVM model
svm_model = SVC(kernel='linear', C=1.0, probability=True)  # You can choose different kernel and hyperparameters
svm_model.fit(X_train, y_train_encoded)

# Step 4: Evaluate the model on the test data
y_test_encoded = encoder.transform(y_test.values.reshape(-1, 1)).argmax(axis=1)
y_pred = svm_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test_encoded, y_pred)
print("Accuracy:", accuracy)

# Print classification report for detailed metrics
print(classification_report(y_test_encoded, y_pred))




ValueError: Found unknown categories ['lonely', 'autism', 'alcoholism'] in column 0 during transform