In [1]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score
import nltk

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# 1. Data Loading
df = pd.read_csv('trainfile.csv')

# 2. Data Cleaning and Advanced Preprocessing
def clean_text(text):
    text = text.lower()  # Lowercasing
    text = re.sub(r'\d+', '', text)  # Removing digits
    text = re.sub(r'[^\w\s]', '', text)  # Removing punctuation
    tokens = word_tokenize(text)  # Tokenization
    tokens = [WordNetLemmatizer().lemmatize(word) for word in tokens]  # Lemmatization
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Removing stopwords
    return ' '.join(tokens)

df['text'] = df['text'].apply(clean_text)

# 3. Text Vectorization with N-grams
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
X = tfidf_vectorizer.fit_transform(df['text'])

# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['task1'])

# 4. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# 5. Model Building and Hyperparameter Tuning

# Support Vector Machine (SVM)
svm_model = SVC()
svm_param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}
svm_grid_search = GridSearchCV(svm_model, svm_param_grid, cv=5, scoring='accuracy')
svm_grid_search.fit(X_train, y_train)

print("Best SVM Parameters:")
print(svm_grid_search.best_params_)

# Gradient Boosting Classifier
gbm_model = GradientBoostingClassifier()
gbm_param_grid = {
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5]
}
gbm_grid_search = GridSearchCV(gbm_model, gbm_param_grid, cv=5, scoring='accuracy')
gbm_grid_search.fit(X_train, y_train)

print("Best GBM Parameters:")
print(gbm_grid_search.best_params_)

# Select the best model (for demonstration, using SVM)
best_model = svm_grid_search.best_estimator_

# 6. Evaluation
y_pred = best_model.predict(X_test)
print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# 7. Custom Input Function
def predict_custom_input(text):
    # Clean and preprocess the input text
    cleaned_text = clean_text(text)
    # Vectorize the text using the trained TF-IDF vectorizer
    vectorized_text = tfidf_vectorizer.transform([cleaned_text])
    # Predict the label using the trained model
    prediction = best_model.predict(vectorized_text)
    # Decode the label to the original category
    decoded_prediction = label_encoder.inverse_transform(prediction)
    return decoded_prediction[0]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Best SVM Parameters:
{'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Best GBM Parameters:
{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}

Accuracy Score:
0.7437070938215103

Classification Report:
              precision    recall  f1-score   support

  non-sexist       0.73      0.72      0.72       205
      sexist       0.75      0.77      0.76       232

    accuracy                           0.74       437
   macro avg       0.74      0.74      0.74       437
weighted avg       0.74      0.74      0.74       437



In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [3]:
custom_text = "womens makes man happy"
predicted_label = predict_custom_input(custom_text)
print(f"Predicted label for the custom input: {predicted_label}")

Predicted label for the custom input: sexist
