In [16]:
# Import necessary libraries
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB

# Read the training and test splits from CSV files in the working directory
train_path, test_path = 'train.csv', 'test.csv'

# Both files are parsed the same way; unpack the two resulting DataFrames
train_data, test_data = map(pd.read_csv, (train_path, test_path))

# Data Cleaning and Preprocessing
# Matches every character that is neither an ASCII letter nor whitespace.
_NON_ALPHA_RE = re.compile(r'[^a-zA-Z\s]')
# Minimal stop-word list (simple example); add more as needed.
_STOP_WORDS = frozenset(['the', 'is', 'in', 'and', 'to', 'a', 'of'])


def preprocess_text(text):
    """Normalize raw text into a space-joined string of crudely stemmed tokens.

    Steps, in order:
      1. drop every character that is not an ASCII letter or whitespace,
      2. lowercase,
      3. split on whitespace,
      4. discard stop words,
      5. strip one trailing 's' from each remaining word (very basic stemming),
      6. discard tokens that became empty or became a stop word by stemming.

    Args:
        text: The raw string. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when nothing is left.

    Raises:
        TypeError: If ``text`` is any other non-string value.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = _NON_ALPHA_RE.sub('', text).lower()

    tokens = []
    for word in text.split():
        # Filter BEFORE stemming: otherwise 'is' is stemmed to 'i' and slips
        # past the stop-word list.
        if word in _STOP_WORDS:
            continue
        stem = word[:-1] if word.endswith('s') else word
        # Filter again AFTER stemming so a word that stems into a stop word
        # (e.g. 'as' -> 'a') is still removed, and skip the empty stem that a
        # lone 's' produces.
        if stem and stem not in _STOP_WORDS:
            tokens.append(stem)
    return ' '.join(tokens)

# Derive a 'cleaned_text' column on both frames from their raw 'text' column
for _frame in (train_data, test_data):
    _frame['cleaned_text'] = _frame['text'].apply(preprocess_text)

# Feature Extraction: learn the vocabulary (capped at 1500 features) from the
# training text only, then reuse that same vocabulary for both splits
vectorizer = CountVectorizer(max_features=1500).fit(train_data['cleaned_text'])

# Convert each split to a dense count matrix
X_train, X_test = (
    vectorizer.transform(split['cleaned_text']).toarray()
    for split in (train_data, test_data)
)

# Labels for the training rows
y_train = train_data['target']

# Model Training: fit() hands back the estimator, so build and fit in one step
model = GaussianNB().fit(X_train, y_train)

# Prediction on Test Data
y_pred = model.predict(X_test)

# Attach the predicted labels to the test frame as a new column
test_data['predicted'] = y_pred

# Persist the augmented test frame as CSV, leaving out the row index
output_path = 'test_with_predictions.csv'
test_data.to_csv(output_path, index=False)
print("Test data with predictions saved to 'test_with_predictions.csv'")

# Show a heading followed by the first rows of the augmented test frame
print(
    "\nContents of the test_with_predictions.csv file:",
    test_data.head(),
    sep="\n",
)


Test data with predictions saved to 'test_with_predictions.csv'

Contents of the test_with_predictions.csv file:
   id keyword location  \
0   0     NaN      NaN   
1   2     NaN      NaN   
2   3     NaN      NaN   
3   9     NaN      NaN   
4  11     NaN      NaN   

                                                                                               text  \
0                                                                Just happened a terrible car crash   
1                                  Heard about #earthquake is different cities, stay safe everyone.   
2  there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all   
3                                                          Apocalypse lighting. #Spokane #wildfires   
4                                                     Typhoon Soudelor kills 28 in China and Taiwan   

                                                                             cleaned_text  \
0                 