In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Load the cleaned training and test datasets
train_df = pd.read_csv("data_clean.csv")
test_df = pd.read_csv("test_clean.csv")

# Drop columns that the feature pipeline below never references
columns_to_drop = ['tweet_chars', 'tweet_words', 'hashtag_chars', 'hashtag_words', 'text_clean_chars', 'text_clean_words']
train_df = train_df.drop(columns=columns_to_drop)

# Preprocessing: fill missing values identically in train and test.
# A single DataFrame-level fillna is used instead of `df[col].fillna(..., inplace=True)`:
# the latter operates on a temporary column object (chained assignment) and is not
# guaranteed to update the frame under pandas copy-on-write.
fill_values = {
    'text_clean': '',           # empty document for missing text
    'country_user': 'Unknown',  # explicit placeholder category
    'gender_user': 'Unknown',   # explicit placeholder category
    'sentiment_score': 0,       # default score for missing sentiment
    'hashtags': '',             # empty document for missing hashtags
}
train_df = train_df.fillna(value=fill_values)
test_df = test_df.fillna(value=fill_values)

# TF-IDF vectorization for 'text_clean' (vocabulary learned on train only)
tfidf_vectorizer = TfidfVectorizer(max_features=13080)
X_tfidf = tfidf_vectorizer.fit_transform(train_df['text_clean'].values.astype('U'))
X_tfidf_test = tfidf_vectorizer.transform(test_df['text_clean'].values.astype('U'))

# Vectorization for 'hashtags'
# TfidfVectorizer is used here; CountVectorizer is the imported alternative.
hashtag_vectorizer = TfidfVectorizer()
X_hashtags = hashtag_vectorizer.fit_transform(train_df['hashtags'].values.astype('U'))

# One-hot encoding for 'country_user' and 'gender_user'.
# Each column gets its own encoder, fitted on the TRAINING data only; the test data is
# only transformed, so train and test matrices share the same columns in the same order.
# handle_unknown='ignore' encodes categories unseen during fit as all-zero rows instead
# of raising.
country_encoder = OneHotEncoder(handle_unknown='ignore')
X_country = country_encoder.fit_transform(train_df[['country_user']])
X_country_test = country_encoder.transform(test_df[['country_user']])

gender_encoder = OneHotEncoder(handle_unknown='ignore')
X_gender = gender_encoder.fit_transform(train_df[['gender_user']])
X_gender_test = gender_encoder.transform(test_df[['gender_user']])

# Backward-compatible alias: this name previously ended up bound to the gender-fitted encoder.
onehot_encoder = gender_encoder

# Apply weighting to the one-hot features (scales the non-zero entries)
country_weight = 5  # weighting factor for the country feature
X_country_weighted = X_country.multiply(country_weight)
gender_weight = 2  # weighting factor for the gender feature
X_gender_weighted = X_gender.multiply(gender_weight)

# Standardize the sentiment score (scaler statistics learned on train only)
scaler = StandardScaler()
X_sentiment = scaler.fit_transform(train_df[['sentiment_score']])

# Combine all features column-wise; the test matrix must be stacked in the same order
X_combined = hstack([X_tfidf, X_country_weighted, X_gender_weighted, csr_matrix(X_sentiment), X_hashtags])

# Encode the target variable as integer codes; keep the categories to map codes back to labels
pol_spec_categorical = train_df['pol_spec_user'].astype('category')
y = pol_spec_categorical.cat.codes
target_labels = pol_spec_categorical.cat.categories

# Hold out 20% of the training data for evaluation
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)



In [13]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameters shared by the hold-out evaluation model and the final model
rf_params = dict(n_estimators=150, max_depth=4000, min_samples_split=2, random_state=42, n_jobs=-1)

# Hold-out evaluation: fit a separate forest on the training split only, so the metrics
# below are computed on rows the evaluated model has not seen.
rf_eval_classifier = RandomForestClassifier(**rf_params)
rf_eval_classifier.fit(X_train, y_train)
y_pred_holdout = rf_eval_classifier.predict(X_test)

# These names are read by the reporting cell; they must be defined for a fresh-kernel run.
accuracy_rf = accuracy_score(y_test, y_pred_holdout)
classification_rep_rf = classification_report(y_test, y_pred_holdout)

# Final model: fit on ALL labelled rows; this is the classifier used for test predictions.
rf_classifier = RandomForestClassifier(**rf_params)
rf_classifier.fit(X_combined, y)

In [14]:
# Print the per-class precision/recall/F1 report.
# NOTE(review): classification_rep_rf must already have been assigned by the evaluation
# step of the training cell before this line runs — confirm on a fresh kernel.
print(classification_rep_rf)

              precision    recall  f1-score   support

           0       0.77      0.62      0.69     21099
           1       0.77      0.58      0.66       153
           2       0.72      0.83      0.77     34943
           3       0.74      0.70      0.72     25250

    accuracy                           0.74     81445
   macro avg       0.75      0.68      0.71     81445
weighted avg       0.74      0.74      0.73     81445



In [10]:


# Apply the same weights to the test one-hot features as were applied to the training ones
X_country_weighted_test = X_country_test.multiply(country_weight)
X_gender_weighted_test = X_gender_test.multiply(gender_weight)

# Reuse the transformers fitted on the training data (transform only, never re-fit on test)
X_sentiment_test = scaler.transform(test_df[['sentiment_score']])
X_hashtags_test = hashtag_vectorizer.transform(test_df['hashtags'].values.astype('U'))

# Combine all features for the test set, in the same column order as X_combined
X_combined_test = hstack([X_tfidf_test, X_country_weighted_test, X_gender_weighted_test, csr_matrix(X_sentiment_test), X_hashtags_test])

# A width mismatch means a transformer was re-fitted on the test data somewhere upstream
assert X_combined_test.shape[1] == X_combined.shape[1], (
    f"test feature matrix has {X_combined_test.shape[1]} columns, "
    f"training matrix has {X_combined.shape[1]}"
)


In [11]:
# Score the assembled test matrix; the classifier returns integer class codes
y_pred_rf = rf_classifier.predict(X_combined_test)

# Map the integer codes back to the original label values
predicted_labels = target_labels[y_pred_rf]

# Start from the provided submission template and overwrite its target column
submission = pd.read_csv('submission_north_europe.csv')
submission['pol_spec_user'] = predicted_labels

# Write the completed submission without the DataFrame index
submission.to_csv('submission74.csv', index=False)
