# Implementation of SVM

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
#  Load the data from combined_df.csv
combined_df = pd.read_csv('./combined_df.csv')

In [3]:
combined_df.drop('post', inplace=True, axis=1)

In [4]:
# randomly drop 80% of the data
combined_df = combined_df.sample(frac=0.2, random_state=1)
print(combined_df.shape)

(74900, 347)


In [None]:
# Normalize case only: every post is lowercased here. Tokenization and further
# cleaning happen later in preprocess_text.
# NOTE(review): an earlier cell runs combined_df.drop('post', inplace=True, axis=1),
# so on that same DataFrame the 'post' column no longer exists and this lookup
# raises KeyError. Reload the CSV (or keep 'post') before running the text cells.
combined_df['post'] = combined_df['post'].apply(lambda x: x.lower())  # Convert to lowercase

In [None]:
import nltk

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Lazily-filled cache so the stop-word set and the lemmatizer are built once
# instead of once per post / once per word.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return ``(stop_words, lemmatizer)``, creating both on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES["stop_words"] = frozenset(stopwords.words("english"))
        _PREPROCESS_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return (
        _PREPROCESS_RESOURCES["stop_words"],
        _PREPROCESS_RESOURCES["lemmatizer"],
    )


def preprocess_text(text):
    """Tokenize *text*, drop noise tokens, lemmatize, and re-join with spaces.

    Steps, in order:
      1. Split the text into tokens with ``nltk.word_tokenize``.
      2. Keep only purely alphanumeric tokens (this removes punctuation).
      3. Remove English stop words.
      4. Lemmatize each remaining token with WordNet.

    Args:
        text: The raw post text (a ``str``).

    Returns:
        A single string containing the surviving lemmas separated by spaces.

    NOTE(review): ``word_tokenize`` and ``WordNetLemmatizer`` need the NLTK
    'punkt' and 'wordnet' data packages; only 'stopwords' is downloaded in
    this cell -- confirm the others are installed.
    """
    stop_words, lemmatizer = _get_preprocess_resources()

    # Same filter order as before: alphanumeric check first, then stop words.
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(text)
        if token.isalnum() and token not in stop_words
    ]
    return " ".join(lemmas)
    
combined_df['cleaned_posts'] = combined_df['post'].apply(preprocess_text)

In [None]:
# gensim can also load pre-trained word embeddings; here a Word2Vec model is trained from scratch
from gensim.models import Word2Vec

# Train Word2Vec model on your preprocessed text data
word2vec_model = Word2Vec(sentences=[text.split() for text in combined_df['cleaned_posts']], vector_size=100, window=5, min_count=1, sg=1)

In [None]:
# Create a function to generate Word2Vec embeddings for a given text
def get_word2vec_features(text, model):
    """Return the mean Word2Vec vector of the in-vocabulary words of *text*.

    The text is split on whitespace; words missing from ``model.wv`` are
    ignored. When no word is found in the vocabulary, a zero vector of length
    ``model.vector_size`` (float32) is returned.
    """
    vocabulary = model.wv
    total = np.zeros((model.vector_size,), dtype="float32")
    hits = 0

    for token in text.split():
        if token not in vocabulary:
            continue
        # Accumulate the running sum of the matched word vectors.
        total = total + vocabulary[token]
        hits += 1

    # Average only when at least one word matched; otherwise keep the zeros.
    if hits == 0:
        return total
    return total / hits

# Generate Word2Vec features for each post in the DataFrame
combined_df['word2vec_features'] = combined_df['cleaned_posts'].apply(lambda post: get_word2vec_features(post, word2vec_model))

In [None]:
word2vec_arrays = np.array(list(combined_df['word2vec_features']))

In [None]:
# Dimensions (rows, columns) of the stacked Word2Vec feature matrix.
shape_word2vec = word2vec_arrays.shape

# Allocate an uninitialized buffer with the same two dimensions ...
X_combined = np.empty((shape_word2vec[0], shape_word2vec[1]))

# ... and fill every column of it with the Word2Vec features.
X_combined[:, 0:shape_word2vec[1]] = word2vec_arrays


In [5]:
from sklearn.model_selection import train_test_split

X = combined_df.drop(['subreddit'],  axis=1)  # Features
y = combined_df['subreddit']  # Target

In [6]:
# Project the full feature matrix X onto its first 5 principal components.
# NOTE(review): PCA is fit on all rows of X, i.e. before the train/test split
# performed in the next cell, so the held-out rows influence the projection.
# Fit PCA on the training split only if leakage-free evaluation is required.

from sklearn.decomposition import PCA

pca = PCA(n_components=5)

X_pca = pca.fit_transform(X)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_pca , y, test_size=0.2, random_state=42)

In [8]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Encode the categorical target variable (y_train) using OneHotEncoder.
# The encoder is fit on the training labels only; sparse_output=False makes it
# return a dense array so argmax can be applied below.
encoder = OneHotEncoder(sparse_output=False)
y_train_encoded = encoder.fit_transform(y_train.values.reshape(-1, 1))

# Step 2: Flatten the encoded labels.
# argmax over the one-hot columns turns each row back into a single integer
# class index, giving 1-D label vectors for train and test.
y_train_encoded = y_train_encoded.argmax(axis=1)
y_test_encoded = encoder.transform(y_test.values.reshape(-1, 1)).argmax(axis=1)

In [None]:
class SVM:
    """Binary linear soft-margin SVM trained with mini-batch sub-gradient descent.

    The objective minimized is ``0.5 * ||w||^2 + C * sum(max(0, 1 - y_i (w.x_i + b)))``.
    Labels are expected to be -1 / +1: the hinge formulation relies on it and
    ``predict`` reports classes through ``np.sign``.
    """

    def __init__(self, C = 1.0):
        # C weights the hinge (error) term against the L2 regularizer.
        self.C = C
        # Placeholders until fit() stores the learned parameters.
        self.w = 0
        self.b = 0

    # Hinge Loss Function / Calculation
    def hingeloss(self, w, b, x, y):
        """Return the regularized hinge loss of ``(w, b)`` on the data ``(x, y)``.

        Args:
            w: Weight vector; any shape that flattens to ``n_features``
               (``fit`` passes a ``(1, n_features)`` array).
            b: Scalar bias.
            x: Sample matrix of shape ``(n_samples, n_features)``.
            y: Labels (-1 / +1), one per sample.

        Returns:
            ``0.5 * ||w||^2 + C * sum_i max(0, 1 - y_i * (w . x_i + b))``.
        """
        w = np.asarray(w, dtype=float).reshape(-1)
        x = np.asarray(x)
        # Keep y one-dimensional so it lines up element-wise with the scores.
        # Reshaping it to a column and multiplying by a row of scores would
        # broadcast to an (n_samples, n_samples) matrix and inflate the loss.
        y = np.asarray(y).reshape(-1)

        # Regularizer term
        reg = 0.5 * np.sum(w * w)

        # Functional margin y_i * (w . x_i + b) of every sample
        margins = y * (np.dot(x, w) + b)

        return reg + self.C * np.sum(np.maximum(0, 1 - margins))

    def fit(self, X, Y, batch_size=100, learning_rate=0.001, epochs=1000):
        """Train the model and store the learned parameters on the instance.

        Args:
            X: Sample matrix of shape ``(n_samples, n_features)``.
            Y: Labels (-1 / +1), one per sample.
            batch_size: Number of samples per gradient step.
            learning_rate: Step size of each update.
            epochs: Number of passes over the data.

        Returns:
            ``(w, b, losses)`` where ``w`` has shape ``(1, n_features)``, ``b``
            is the bias and ``losses`` holds the full-data hinge loss measured
            at the start of every epoch.

        Raises:
            ValueError: If ``X`` and ``Y`` contain a different number of samples.
        """
        X = np.asarray(X)
        Y = np.asarray(Y).reshape(-1)

        number_of_samples, number_of_features = X.shape
        if Y.shape[0] != number_of_samples:
            raise ValueError(
                f"X has {number_of_samples} samples but Y has {Y.shape[0]} labels"
            )

        c = self.C

        # Visit the samples in a random order; the order is drawn once and
        # reused for every epoch.
        ids = np.arange(number_of_samples)
        np.random.shuffle(ids)

        w = np.zeros((1, number_of_features))
        b = 0
        losses = []

        # Gradient Descent
        for _ in range(epochs):
            # Track the loss on the whole data set before this epoch's updates
            losses.append(self.hingeloss(w, b, X, Y))

            for batch_initial in range(0, number_of_samples, batch_size):
                batch_ids = ids[batch_initial:batch_initial + batch_size]
                x_batch = X[batch_ids]
                y_batch = Y[batch_ids]

                # w and b are constant within a batch, so all margins of the
                # batch can be evaluated in one vectorized step.
                margins = y_batch * (np.dot(x_batch, w[0]) + b)

                # Only samples that are not strictly beyond the margin
                # contribute to the sub-gradient.
                active = ~(margins > 1)
                gradw = c * np.dot(y_batch[active], x_batch[active])
                gradb = c * np.sum(y_batch[active])

                # Updating weights and bias: the "- learning_rate * w" term is
                # the gradient of the regularizer.
                w = w - learning_rate * w + learning_rate * gradw
                b = b + learning_rate * gradb

        self.w = w
        self.b = b

        return self.w, self.b, losses

    def predict(self, X):
        """Return ``sign(w . x + b)`` for every row of ``X``.

        The result is +1 or -1, or 0 for a sample lying exactly on the
        decision boundary. ``fit`` must have been called first.
        """
        prediction = np.dot(X, self.w[0]) + self.b # w.x + b
        return np.sign(prediction)

In [9]:
print(X_pca.shape)

(74900, 5)


In [10]:

# Step 3: Train scikit-learn's SVC (linear kernel, C=1e3) on the PCA-reduced
# training split, with verbose solver output.
# NOTE(review): probability=True makes fit() run an extra internal
# cross-validation for probability calibration, yet predict_proba is not used
# in this cell -- consider dropping it to shorten training.
svm_model = SVC(kernel='linear', C=1e3, probability=True, verbose=1)
# Debug marker printed right before the (long-running) fit call.
print("here") 
svm_model.fit(X_train, y_train_encoded)

# Predict integer class indices for the held-out split.
y_pred = svm_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test_encoded, y_pred)
print("Accuracy:", accuracy)

# Print classification report for detailed metrics
print(classification_report(y_test_encoded, y_pred))


here
[LibSVM]...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................