In [1]:
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# Load the training data
train_df = pd.read_csv('train.csv')

# Define stop words.
# On a fresh environment the NLTK stop-word corpus is not installed and
# stopwords.words() raises LookupError; fetch it once and retry so the
# notebook survives Restart & Run All on a new machine.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

# Preprocess the text data
def preprocess_text(text):
    """Lowercase a document, tokenize it, and drop stop words.

    Parameters
    ----------
    text : str or missing value
        Raw document. Missing values (None / NaN / pd.NA), which pandas
        produces for empty CSV cells, are treated as an empty document.
        Any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The tokens that are not in the module-level ``stop_words`` set,
        joined by single spaces. ``''`` for missing input.
    """
    if not isinstance(text, str):
        # Empty cells arrive as float NaN, which has no .lower(); map them
        # to an empty document instead of crashing the whole .apply().
        if pd.isna(text):
            return ''
        text = str(text)

    # Lowercase first so the comparison against stop_words is effectively
    # case-insensitive.
    tokens = word_tokenize(text.lower())

    return ' '.join(token for token in tokens if token not in stop_words)

# Clean every training document; the cleaned text replaces the raw column
train_df['text'] = train_df['text'].apply(preprocess_text)

# Hold out 20% of the rows for validation (seeded so the split is repeatable)
text_train, text_val, y_train, y_val = train_test_split(
    train_df['text'],
    train_df['label'],
    test_size=0.2,
    random_state=42,
)

# Learn the TF-IDF vocabulary and weights from the training split only, then
# project both splits into that feature space
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train).astype('float64')
X_val = vectorizer.transform(text_val).astype('float64')

# Base SVM estimator whose hyperparameters are tuned below
svm = SVC()

# Hyperparameter search space (every entry is a finite list of candidates)
param_dist = {
    'C': np.logspace(-3, 3, 7),
    'kernel': ['linear', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'],
}

# The space is a finite grid of 7 * 3 * 2 = 42 combinations. Requesting more
# iterations than that makes scikit-learn emit a warning and clamp the count,
# so cap n_iter at the grid size explicitly.
n_candidates = int(np.prod([len(values) for values in param_dist.values()]))

# Use random search to tune the hyperparameters.
# scoring='f1_weighted' makes the search optimise the same metric that is
# reported on the validation set below (the default would be accuracy).
random_search = RandomizedSearchCV(
    svm,
    param_distributions=param_dist,
    n_iter=min(50, n_candidates),
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Print the best hyperparameters found
print('Best hyperparameters:', random_search.best_params_)

# With the default refit=True the search has already retrained the winning
# configuration on all of X_train, so reuse that model instead of fitting an
# identical SVC a second time.
svm = random_search.best_estimator_

# Make predictions on the validation set
y_pred = svm.predict(X_val)

# Evaluate the performance of the model using F1 score
f1 = f1_score(y_val, y_pred, average='weighted')
print('F1 score:', f1)

# Load the test data and clean it with the same routine used for training
test_df = pd.read_csv('test.csv')
test_df['text'] = test_df['text'].apply(preprocess_text)

# Refit the vectorizer on the full training set BEFORE touching the test set.
# Refitting rebuilds the vocabulary, so any matrix produced by the earlier fit
# no longer lines up column-for-column with X_full.
X_full = vectorizer.fit_transform(train_df['text']).astype('float64')
y_full = train_df['label']

# Transform the test documents with the refit vectorizer so X_test has exactly
# the feature columns the ensemble is trained on
X_test = vectorizer.transform(test_df['text']).astype('float64')

# Define an ensemble of models.
# VotingClassifier fits clones of the estimators, so the tuned `svm` only
# contributes its hyperparameters here and is itself left untouched.
ensemble = VotingClassifier([('svm', svm), ('nb', MultinomialNB())])

# Train the ensemble on the full training set
ensemble.fit(X_full, y_full)

# Make predictions on the test data using the ensemble
y_test_pred = ensemble.predict(X_test)

# Create a submission file
submission_df = pd.DataFrame({'id': test_df['id'], 'label': y_test_pred})
submission_df.to_csv('submission.csv', index=False)


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Anaconda\lib\site-packages\IPython\core\interactiveshell.py", line 3457, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\kkksk\AppData\Local\Temp\ipykernel_18544\851325097.py", line 1, in <module>
    import pandas as pd
  File "C:\Anaconda\lib\site-packages\pandas\__init__.py", line 11, in <module>
    __import__(dependency)
  File "C:\Anaconda\lib\site-packages\numpy\__init__.py", line 138, in <module>
    from ._version import get_versions
  File "<frozen importlib._bootstrap>", line 1007, in _find_and_load
  File "<frozen importlib._bootstrap>", line 986, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 680, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 846, in exec_module
  File "<frozen importlib._bootstrap_external>", line 941, in get_code
  File "<frozen importlib._bootstrap_external>", line 1039, in get_data
KeyboardInterrupt

During handling of the

TypeError: object of type 'NoneType' has no len()