In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the tab-separated Sentiment140 sample (sentiment140_1000.csv) directly
# from the GitHub raw URL into a DataFrame.
df = pd.read_csv(
    "https://raw.githubusercontent.com/Shoyeb45/DataScience/refs/heads/main/Dataset/data-week-2/sentiment140_1000.csv",
    sep="\t",
)

In [4]:
# Show the first five rows together with the (rows, columns) shape as one tuple.
(df.head(5), df.shape)

(                                               tweet  sentiment
 0  @switchfoot http://twitpic.com/2y1zl - Awww, t...          0
 1  is upset that he can't update his Facebook by ...          0
 2  @Kenichan I dived many times for the ball. Man...          0
 3    my whole body feels itchy and like its on fire           0
 4  @nationwideclass no, it's not behaving at all....          0,
 (1000, 2))

In [5]:
# Class balance check: how many tweets carry each raw sentiment label.
df.loc[:, "sentiment"].value_counts()

sentiment
0    500
4    500
Name: count, dtype: int64

In [6]:
from sklearn.preprocessing import LabelEncoder

# Learn the mapping from the raw sentiment labels to consecutive integer ids,
# then store the encoded ids in a new "class" column next to the originals.
encode = LabelEncoder().fit(df["sentiment"])
df["class"] = encode.transform(df["sentiment"])

In [7]:
# Sanity-check the encoding: preview the frame (now with the "class" column),
# then show the raw labels the encoder learned and the ids it produced.
print(df.head())
print(f"Original classes : {encode.classes_}")
# Use single quotes inside the replacement field: reusing the outer double
# quote inside an f-string is a SyntaxError on Python versions before 3.12.
print(f"Label classes : {df['class'].unique()}")


                                               tweet  sentiment  class
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...          0      0
1  is upset that he can't update his Facebook by ...          0      0
2  @Kenichan I dived many times for the ball. Man...          0      0
3    my whole body feels itchy and like its on fire           0      0
4  @nationwideclass no, it's not behaving at all....          0      0
Original classes : [0 4]
Label classes : [0 1]


In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-n-grams representation of the tweets.
# (1, 2) -> count both single words (unigrams) and adjacent word pairs (bigrams).
ngram_range = (1, 2)

# max_features=1000 caps the vocabulary size of the count matrix.
vectorizer = CountVectorizer(
    max_features=1000,
    ngram_range=ngram_range,
)

# Learn the vocabulary from every tweet and build the document-term matrix.
X = vectorizer.fit_transform(df["tweet"])


In [9]:
# Classifier: support vector machine with a linear kernel.
from sklearn.svm import SVC

# C is the regularisation strength; 1.0 is passed explicitly here.
model_svm = SVC(C=1.0, kernel="linear")

In [10]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

# Evaluate the linear SVM with shuffled 5-fold cross-validation, recording
# accuracy and weighted precision / recall / F-score for every fold.
#
# NOTE(review): X was produced by fitting the vectorizer on *all* tweets before
# splitting, so each fold's vocabulary was chosen with its test tweets visible.
# Consider fitting the vectorizer on the training fold only (e.g. a Pipeline).
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5-Fold Cross-Validation
cv_accuracies = []
precision_list = []
recall_list = []
fscore_list = []

# Target column, looked up once instead of on every fold.
labels = df["class"]

print("Performing K-Fold Cross-Validation...")
for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    # kf.split yields *positional* row numbers. Select labels with .iloc so the
    # rows always line up with X, even if df's index is no longer 0..n-1
    # (plain [] indexing would look the numbers up as index labels instead).
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]

    # Train the model (fit() re-trains from scratch on this fold's data)
    model_svm.fit(X_train, y_train)

    # Predict and evaluate on the held-out fold
    y_pred = model_svm.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    # average='weighted' averages the per-class scores weighted by class support
    precision, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')

    # Store results for the summary statistics computed afterwards
    cv_accuracies.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    fscore_list.append(fscore)

    print(f"Fold {fold}:")
    print(f"  Accuracy: {accuracy:.2f}")
    print(f"  Precision: {precision:.2f}, Recall: {recall:.2f}, F-score: {fscore:.2f}")

Performing K-Fold Cross-Validation...
Fold 1:
  Accuracy: 0.64
  Precision: 0.64, Recall: 0.64, F-score: 0.64
Fold 2:
  Accuracy: 0.56
  Precision: 0.57, Recall: 0.56, F-score: 0.56
Fold 3:
  Accuracy: 0.64
  Precision: 0.64, Recall: 0.64, F-score: 0.64
Fold 4:
  Accuracy: 0.65
  Precision: 0.65, Recall: 0.65, F-score: 0.65
Fold 5:
  Accuracy: 0.66
  Precision: 0.66, Recall: 0.66, F-score: 0.66


In [11]:
# Final evaluation
mean_accuracy = np.mean(cv_accuracies)
std_accuracy = np.std(cv_accuracies)
mean_precision = np.mean(precision_list)
mean_recall = np.mean(recall_list)
mean_fscore = np.mean(fscore_list)

print("\nCross-Validation Results:")
print(f"Mean Accuracy: {mean_accuracy:.2f} (±{std_accuracy:.2f})")
print(f"Mean Precision: {mean_precision:.2f}")
print(f"Mean Recall: {mean_recall:.2f}")
print(f"Mean F-score: {mean_fscore:.2f}")


Cross-Validation Results:
Mean Accuracy: 0.63 (±0.04)
Mean Precision: 0.63
Mean Recall: 0.63
Mean F-score: 0.63
