In [1]:
import re

def preprocess_text(text):
    # Remove any non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Convert to lowercase
    text = text.lower()

    return text


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

# Load the data
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Preprocess the data
df_train['text'] = df_train['text'].str.replace('[^\w\s]','') # remove punctuation
df_test['text'] = df_test['text'].str.replace('[^\w\s]','') # remove punctuation

# Split the data into training and validation sets
train_data, val_data = train_test_split(df_train, test_size=0.2)

# Vectorize the data
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_data['text'])
X_val = vectorizer.transform(val_data['text'])
X_test = vectorizer.transform(df_test['text'])

y_train = train_data['label']
y_val = val_data['label']

# Train the model
model = LinearSVC()
model.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = model.predict(X_val)

# Evaluate the performance of the model
f1 = f1_score(y_val, y_val_pred, average='macro')
print('F1 score:', f1)

# Make predictions on the test set and save the results
y_test_pred = model.predict(X_test)
df_submission = pd.DataFrame({'id': df_test['id'], 'label': y_test_pred})
df_submission.to_csv('submission.csv', index=False)


  df_train['text'] = df_train['text'].str.replace('[^\w\s]','') # remove punctuation
  df_test['text'] = df_test['text'].str.replace('[^\w\s]','') # remove punctuation


F1 score: 0.7428007610640801


In [None]:
# Preprocess the text data
    train_data.loc[:,'text'] = train_data['text'].apply(preprocess_text)
    val_data.loc[:,'text'] = val_data['text'].apply(preprocess_text)
