In [24]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string


In [25]:
# Read the training set from disk into a DataFrame.
TRAIN_CSV_PATH = "train.csv"
training_data = pd.read_csv(TRAIN_CSV_PATH)


In [26]:
# Quick look at the data: a caption line followed by the first rows.
print(
    "First few rows of the training data:",
    training_data.head(),
    sep="\n",
)

First few rows of the training data:
             tweet_id airline_sentiment    airline airline_sentiment_gold  \
0  567900433542488064          negative  Southwest                    NaN   
1  569989168903819264          positive  Southwest                    NaN   
2  568089179520954368          positive     United                    NaN   
3  568928195581513728          negative  Southwest                    NaN   
4  568594180014014464          negative     United                    NaN   

            name negativereason_gold  retweet_count  \
0  ColeyGirouard                 NaN              0   
1  WalterFaddoul                 NaN              0   
2      LocalKyle                 NaN              0   
3    amccarthy19                 NaN              0   
4        J_Okayy                 NaN              0   

                                                text tweet_coord  \
0  @SouthwestAir I am scheduled for the morning, ...         NaN   
1  @SouthwestAir seeing your work

In [27]:
# Column names, non-null counts and dtypes of the training data.
# DataFrame.info() writes its report directly to stdout and returns None, so it
# is called on its own; wrapping it in print() emitted an extra "None" line.
print("Information about the training data:")
training_data.info()



Information about the training data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10980 entries, 0 to 10979
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   tweet_id                10980 non-null  int64 
 1   airline_sentiment       10980 non-null  object
 2   airline                 10980 non-null  object
 3   airline_sentiment_gold  31 non-null     object
 4   name                    10980 non-null  object
 5   negativereason_gold     24 non-null     object
 6   retweet_count           10980 non-null  int64 
 7   text                    10980 non-null  object
 8   tweet_coord             776 non-null    object
 9   tweet_created           10980 non-null  object
 10  tweet_location          7430 non-null   object
 11  user_timezone           7403 non-null   object
dtypes: int64(2), object(10)
memory usage: 1.0+ MB
None


In [28]:
# NOTE(review): this entire cell is commented out, so none of it runs — the
# cells that follow read training_data['text'] without this cleaning applied.
# If it is re-enabled, it presumably needs the NLTK stopword / WordNet /
# tokenizer data to be available locally — TODO confirm before uncommenting.
# # Data preprocessing
# stop_words = set(stopwords.words('english'))
# lemmatizer = WordNetLemmatizer()

# def preprocess_text(text):
#     # Convert to lowercase
#     text = text.lower()
#     # Remove punctuation
#     text = text.translate(str.maketrans('', '', string.punctuation))
#     # Tokenization
#     tokens = nltk.word_tokenize(text)
#     # Remove stopwords
#     tokens = [word for word in tokens if word not in stop_words]
#     # Lemmatization
#     tokens = [lemmatizer.lemmatize(word) for word in tokens]
#     # Join tokens back into text
#     return ' '.join(tokens)

# training_data['text'] = training_data['text'].apply(preprocess_text)
# print(training_data['text'])

In [29]:
# Features are the tweet text column; the target is the sentiment label column.
X, y = training_data['text'], training_data['airline_sentiment']

# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [30]:
# TF-IDF vectorization.
# The vectorizer is fitted on the training split only and then applied,
# unchanged, to the validation split.
tfidf_params = {
    "max_features": 5000,
    "max_df": 0.8,
    "min_df": 0.001,
}
tfidf_vectorizer = TfidfVectorizer(**tfidf_params)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)


In [31]:
# Model building and training.
# A random forest is stochastic (bootstrap samples and per-split feature
# sub-sampling), so the seed is fixed to make the fitted model and the reported
# validation accuracy repeatable across kernel restarts. The same seed value is
# used for the train/validation split.
clf = RandomForestClassifier(n_estimators=4000, n_jobs=-1, random_state=42)
clf.fit(X_train_tfidf, y_train)


In [32]:
# Predict a sentiment label for every row of the vectorized validation split.
y_pred = clf.predict(X=X_val_tfidf)


In [33]:
# Score the model: share of validation rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print("\nAccuracy on validation set:", accuracy)


Accuracy on validation set: 0.7550091074681239
