In [2]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Load the training data
df_train = pd.read_csv('train.csv')

# Preprocess the text data
def preprocess_text(text):
    # Remove unnecessary characters and punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    return text

df_train['text'] = df_train['text'].apply(preprocess_text)

# Split the data into training and validation sets
train_data, val_data = train_test_split(df_train, test_size=0.2)

# Create a CountVectorizer to convert text to a bag of words
vectorizer = CountVectorizer()

# Create an SVM model with default hyperparameters
svm_model = LinearSVC(max_iter = 50000)

# Create a pipeline to combine the vectorizer and the model
pipeline = Pipeline([('vectorizer', vectorizer), ('svm', svm_model)])

# Define the hyperparameters to search over
params = {'svm__C': [0.1, 1], 'svm__max_iter': [100, 1000]}

# Perform a grid search over the hyperparameters using 5-fold cross-validation
grid_search = GridSearchCV(pipeline, params, cv=5, scoring='f1_macro')
grid_search.fit(train_data['text'], train_data['label'])

# Print the best hyperparameters found
print('Best hyperparameters:', grid_search.best_params_)

# Evaluate the performance of the model on the validation set
y_true = val_data['label']
y_pred = grid_search.predict(val_data['text'])
f1_score = metrics.f1_score(y_true, y_pred, average='macro')
print('Validation F1 score:', f1_score)

# Load the test data
df_test = pd.read_csv('test.csv')

# Preprocess the text data
df_test['text'] = df_test['text'].apply(preprocess_text)

# Make predictions on the test data using the best model
y_pred = grid_search.predict(df_test['text'])

# Save the predictions to a CSV file
df_submission = pd.DataFrame({'id': df_test['id'], 'label': y_pred})
df_submission.to_csv('submission.csv', index=False)




Best hyperparameters: {'svm__C': 0.1, 'svm__max_iter': 1000}


NameError: name 'metrics' is not defined