In [3]:
pip install pandas numpy scikit-learn nltk wordcloud matplotlib seaborn


Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load Dataset
train = pd.read_csv(r"C:\Users\Vishwa\Desktop\GUVI captone\twitter\ds\train.csv")
test = pd.read_csv(r"C:\Users\Vishwa\Desktop\GUVI captone\twitter\ds\test.csv")

# Text Cleaning Function
def clean_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)  # Remove single characters
    text = re.sub(r'\^[a-zA-Z]\s+', ' ', text)  # Remove single characters at start
    text = re.sub(r'\s+', ' ', text, flags=re.I)  # Replace multiple spaces
    text = text.lower()  # Convert to lowercase
    text = ' '.join(word for word in text.split() if word not in stop_words)  # Remove stopwords
    return text

# Apply text cleaning
train['cleaned_tweet'] = train['tweet'].apply(clean_text)
test['cleaned_tweet'] = test['tweet'].apply(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vishwa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert text to TF-IDF features
tfidf = TfidfVectorizer(max_features=5000)  # Use top 5000 words
X_train = tfidf.fit_transform(train['cleaned_tweet'])
X_test = tfidf.transform(test['cleaned_tweet'])

y_train = train['label']


In [6]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Train SVM Model
svm_model = SVC(kernel='linear', C=1.0)  # Linear Kernel
svm_model.fit(X_train, y_train)

# Predictions
y_pred = svm_model.predict(X_train)

# Evaluate Model Performance
print("SVM Model Performance:")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("Precision:", precision_score(y_train, y_pred))
print("Recall:", recall_score(y_train, y_pred))
print("F1-Score:", f1_score(y_train, y_pred))


SVM Model Performance:
Accuracy: 0.9701207684124898
Precision: 0.949685534591195
Recall: 0.6061552185548618
F1-Score: 0.7399945548597876


In [7]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Train SVM Model
svm_model = SVC(kernel='linear', C=1.0)  # Linear Kernel
svm_model.fit(X_train, y_train)

# Predictions
y_pred = svm_model.predict(X_train)

# Evaluate Model Performance
print("SVM Model Performance:")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("Precision:", precision_score(y_train, y_pred))
print("Recall:", recall_score(y_train, y_pred))
print("F1-Score:", f1_score(y_train, y_pred))


SVM Model Performance:
Accuracy: 0.9701207684124898
Precision: 0.949685534591195
Recall: 0.6061552185548618
F1-Score: 0.7399945548597876


In [8]:
test['prediction'] = svm_model.predict(X_test)

# Save Predictions
test[['id', 'prediction']].to_csv("svm_test_predictions.csv", index=False)
print("Predictions saved successfully in svm_test_predictions.csv!")


Predictions saved successfully in svm_test_predictions.csv!


In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Define SVM hyperparameters to tune
param_grid = {
    'C': [0.1, 1, 10],  # Regularization parameter
    'kernel': ['linear', 'rbf', 'poly'],  # Kernel type
    'gamma': ['scale', 'auto']  # Kernel coefficient (only for 'rbf' and 'poly')
}

# Perform Grid Search
grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best Parameters
print("Best Parameters:", grid_search.best_params_)

# Train the best model
best_svm = grid_search.best_estimator_


Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}


In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict using the best model
y_pred_best = best_svm.predict(X_train)

# Evaluate Model Performance
print("\nOptimized SVM Model Performance:")
print("Accuracy:", accuracy_score(y_train, y_pred_best))
print("Precision:", precision_score(y_train, y_pred_best))
print("Recall:", recall_score(y_train, y_pred_best))
print("F1-Score:", f1_score(y_train, y_pred_best))



Optimized SVM Model Performance:
Accuracy: 0.999311682623115
Precision: 0.9991007194244604
Recall: 0.9910793933987511
F1-Score: 0.9950738916256158


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Train Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_train)

# Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_train)

# Compare Performance
models = {
    "SVM": best_svm,
    "Logistic Regression": lr_model,
    "Random Forest": rf_model
}

for name, model in models.items():
    y_pred = model.predict(X_train)
    print(f"\n{name} Performance:")
    print("Accuracy:", accuracy_score(y_train, y_pred))
    print("Precision:", precision_score(y_train, y_pred))
    print("Recall:", recall_score(y_train, y_pred))
    print("F1-Score:", f1_score(y_train, y_pred))



SVM Performance:
Accuracy: 0.999311682623115
Precision: 0.9991007194244604
Recall: 0.9910793933987511
F1-Score: 0.9950738916256158

Logistic Regression Performance:
Accuracy: 0.9580126400100119
Precision: 0.9420432220039293
Recall: 0.427743086529884
F1-Score: 0.5883435582822085

Random Forest Performance:
Accuracy: 0.9993742569301045
Precision: 0.9995503597122302
Recall: 0.9915254237288136
F1-Score: 0.9955217196596506


In [12]:
pip install joblib streamlit pandas scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [13]:

import joblib

# Save the model
joblib.dump(best_svm, "svm_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")  # Save TF-IDF Vectorizer

print("SVM Model and Vectorizer saved successfully!")


SVM Model and Vectorizer saved successfully!


In [42]:
print(train['label'].value_counts())


label
0    29720
1     2242
Name: count, dtype: int64


In [44]:
from imblearn.over_sampling import SMOTE
import pandas as pd

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", pd.Series(y_resampled).value_counts())


Before SMOTE: label
0    29720
1     2242
Name: count, dtype: int64
After SMOTE: label
0    29720
1    29720
Name: count, dtype: int64


In [46]:
print(clean_text("I hate this group of people!"))


hate group people


In [48]:
if "hate" in stop_words:
    stop_words.remove("hate")


In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,2))  # Use bigrams (word pairs)
X_train = tfidf.fit_transform(train['cleaned_tweet'])
X_test = tfidf.transform(test['cleaned_tweet'])


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

param_grid = {
    'C': [0.1, 1, 10], 
    'kernel': ['linear', 'rbf'], 
    'gamma': ['scale', 'auto']
}

grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
best_svm = grid_search.best_estimator_


In [None]:
y_pred = best_svm.predict(tfidf.transform(["I hate this group of people!"]))
print("Prediction:", y_pred)  # Expecting 1 for hate speech


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import joblib

# Load dataset
df = pd.read_csv(r"C:\Users\Vishwa\Desktop\GUVI captone\twitter\ds\train.csv")  # Update path

# Class distribution check
print(df["label"].value_counts())

# Balance the dataset (Oversampling Minority Class)
from sklearn.utils import resample
df_majority = df[df.label == 0]  # Non-hate speech
df_minority = df[df.label == 1]  # Hate speech

df_minority_upsampled = resample(df_minority, 
                                 replace=True,  # Upsample minority class
                                 n_samples=len(df_majority),  
                                 random_state=42)

df_balanced = pd.concat([df_majority, df_minority_upsampled])

# Shuffle dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
print(df_balanced["label"].value_counts())  # Check balance

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(df_balanced["tweet"], df_balanced["label"], test_size=0.2, random_state=42)


label
0    29720
1     2242
Name: count, dtype: int64
label
0    29720
1    29720
Name: count, dtype: int64


In [3]:
# TF-IDF vectorization
tfidf = TfidfVectorizer(max_features=5000, stop_words="english")
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [5]:
# Train SVM model
svm_model = SVC(kernel="linear", probability=True)
svm_model.fit(X_train_tfidf, y_train)

# Test Accuracy
print(f"Model Accuracy: {svm_model.score(X_test_tfidf, y_test) * 100:.2f}%")


Model Accuracy: 96.93%


In [7]:
# Save the trained model
joblib.dump(svm_model, "svm_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")
print("Model & Vectorizer Saved Successfully!")


Model & Vectorizer Saved Successfully!


In [9]:
import joblib

# Load the new model
svm_model = joblib.load("svm_model.pkl")
tfidf = joblib.load("tfidf_vectorizer.pkl")

# Test input
test_text = "hate worst bad"
transformed_text = tfidf.transform([test_text])
prediction = svm_model.predict(transformed_text)[0]

print(f"Model Prediction: {prediction}")  # Expected: 1 (Hate Speech)


Model Prediction: 1
