<a href="https://colab.research.google.com/github/Shahanas2003/svm_project/blob/main/SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

# Read the dataset; the 'text' column supplies the inputs and 'label' the targets.
data = pd.read_csv("emotions.csv")
X, y = data['text'], data['label']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features: the vectorizer is fitted on the training rows only and is
# then applied, unchanged, to the test rows.
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Linear SVM only (fit() returns the fitted estimator itself).
model = LinearSVC(random_state=42, dual=False, max_iter=1000).fit(X_train_tfidf, y_train)

# Report accuracy and the per-class metrics on the held-out rows.
y_pred = model.predict(X_test_tfidf)
print("LinearSVC Performance:")
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {test_accuracy:.4f}")
print(classification_report(y_test, y_pred))


LinearSVC Performance:
Accuracy: 0.8905
              precision    recall  f1-score   support

           0       0.94      0.93      0.93     24201
           1       0.91      0.92      0.91     28164
           2       0.78      0.77      0.78      6929
           3       0.90      0.90      0.90     11441
           4       0.84      0.84      0.84      9594
           5       0.72      0.70      0.71      3033

    accuracy                           0.89     83362
   macro avg       0.85      0.84      0.85     83362
weighted avg       0.89      0.89      0.89     83362



In [3]:
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK resources used by preprocess_text (stop-word list and WordNet).
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Holds the English stop-word set after its first load so that later calls
# (and later tokens) do not hit the NLTK corpus reader again.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    cached = _STOPWORD_CACHE.get('english')
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _STOPWORD_CACHE['english'] = cached
    return cached


def preprocess_text(text):
    """Clean a piece of text and return it as a space-joined string of tokens.

    Steps, in order:
      1. Remove URL-like tokens (anything starting with ``http`` or ``www``).
      2. Remove ``@mentions`` and bare ``#`` characters.
      3. Remove every character that is not an ASCII letter or whitespace.
      4. Lower-case the text and split it on whitespace.
      5. Drop NLTK English stop words.
      6. Lemmatize each remaining token with ``WordNetLemmatizer`` (default
         arguments).

    Args:
        text: The string to clean.

    Returns:
        The surviving lemmatized tokens joined by single spaces; an empty
        string when nothing survives.
    """
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()

    # Look the stop words up once per call (and load them once overall) instead
    # of calling stopwords.words('english') for every single token, which
    # rebuilt the list and scanned it linearly each time.
    stop_words = _english_stopwords()
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in text.split() if t not in stop_words]
    return ' '.join(tokens)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [4]:
def predict_emotion(tweet):
    """Return the predicted label for a single tweet.

    The tweet is cleaned with ``preprocess_text``, converted to features by the
    notebook-level ``vectorizer``, and classified by the notebook-level
    ``model``. The result is the first (and only) element of the prediction
    for that one-item batch.

    NOTE(review): ``vectorizer`` is fitted on the 'text' column without going
    through ``preprocess_text``, whereas this function applies it before
    vectorizing — confirm that the mismatch is intended.
    """
    features = vectorizer.transform([preprocess_text(tweet)])
    return model.predict(features)[0]

In [5]:
# Run the classifier on one hand-written tweet and show the input next to the result.
example_tweet = "I'm so frustrated and angry right now!"
predicted_emotion = predict_emotion(example_tweet)
for caption, shown_value in (
    ("Tweet:", example_tweet),
    ("Predicted Emotion:", predicted_emotion),
):
    print(caption, shown_value)

Tweet: I'm so frustrated and angry right now!
Predicted Emotion: 3


In [8]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split # Import train_test_split
import pandas as pd # Import pandas if not already imported in this cell

# Compare non-linear SVC kernels on a 10,000-row random sample of `data`.
# NOTE(review): the features below come from `vectorizer`, which was fitted on
# the training split of the full dataset; rows sampled here may overlap that
# split — confirm this is acceptable for the comparison.
data_subset = data.sample(n=10000, random_state=42).copy()
X_subset, y_subset = data_subset['text'], data_subset['label']

# Hold out 20% of the sample for validation.
X_train_small, X_val_small, y_train_small, y_val_small = train_test_split(
    X_subset,
    y_subset,
    test_size=0.2,
    random_state=42,
)

# The already-fitted TF-IDF vectorizer is only applied here, not refitted.
X_train_small_tfidf = vectorizer.transform(X_train_small)
X_val_small_tfidf = vectorizer.transform(X_val_small)

# One classifier per kernel, each capped at 1000 solver iterations.
kernel_names = ('poly', 'rbf', 'sigmoid')
svm_poly, svm_rbf, svm_sigmoid = [
    SVC(kernel=kernel_name, max_iter=1000) for kernel_name in kernel_names
]
kernel_models = (svm_poly, svm_rbf, svm_sigmoid)

# Train every model on the vectorized training sample.
for classifier in kernel_models:
    classifier.fit(X_train_small_tfidf, y_train_small)

# Predict on the vectorized validation sample.
y_pred_poly, y_pred_rbf, y_pred_sigmoid = [
    classifier.predict(X_val_small_tfidf) for classifier in kernel_models
]

# Score each kernel's predictions against the validation labels.
accuracy_poly, accuracy_rbf, accuracy_sigmoid = [
    accuracy_score(y_val_small, predicted)
    for predicted in (y_pred_poly, y_pred_rbf, y_pred_sigmoid)
]

print(f"Accuracy (Polynomial): {accuracy_poly}")
print(f"Accuracy (RBF): {accuracy_rbf}")
print(f"Accuracy (Sigmoid): {accuracy_sigmoid}")



Accuracy (Polynomial): 0.7
Accuracy (RBF): 0.821
Accuracy (Sigmoid): 0.855
