In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Read the tweet dataset from disk into a DataFrame.
file_path = 'tweets_country_region.csv'  # Replace with your file path
tweets_data = pd.read_csv(filepath_or_buffer=file_path)

# Step 1: Preprocessing
# Combine `keyword` and `text` into a single feature. Both columns are
# NaN-filled: a missing `text` would otherwise turn the whole combined value
# into NaN, which TfidfVectorizer rejects as an invalid document.
tweets_data['combined_text'] = (
    tweets_data['keyword'].fillna('') + ' ' + tweets_data['text'].fillna('')
)

# Encode `country` as integer codes; missing countries share the 'Unknown' label.
encoder = LabelEncoder()
tweets_data['country_encoded'] = encoder.fit_transform(tweets_data['country'].fillna('Unknown'))

# Step 2: Feature Selection
# Define features and target
X_text = tweets_data['combined_text']
X_country = tweets_data['country_encoded']
y = tweets_data['target']

# The integer country codes carry no meaningful order, so passing them to a
# linear model as one numeric column would impose an arbitrary ranking on
# countries. Expand them into one indicator column per country by selecting
# rows of an identity matrix (row i of the identity is the one-hot vector
# for code i).
import scipy.sparse as sp
X_country_onehot = sp.identity(len(encoder.classes_), format='csr')[X_country.to_numpy()]

# Step 3: Train/Test Split
# Split row positions *before* fitting the vectorizer so that the TF-IDF
# vocabulary and IDF weights are learned from training rows only; fitting on
# all rows would leak information about the test documents into the features.
row_positions = list(range(len(tweets_data)))
train_idx, test_idx = train_test_split(row_positions, test_size=0.2, random_state=42)

# Vectorize the text data: fit on the training rows, then transform every row
# so X_text_vectorized stays aligned with tweets_data.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
vectorizer.fit(X_text.iloc[train_idx])
X_text_vectorized = vectorizer.transform(X_text)

# Combine the text features with the one-hot country columns. CSR format is
# requested explicitly because the row selection below needs an indexable
# sparse matrix (hstack returns COO by default).
X_combined = sp.hstack([X_text_vectorized, X_country_onehot], format='csr')

X_train, X_test = X_combined[train_idx], X_combined[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Step 4: Logistic Regression
# Fit the classifier on the training split, allowing up to 500 solver
# iterations; fit() returns the estimator itself, so it can be bound directly.
model = LogisticRegression(max_iter=500).fit(X_train, y_train)

# Predict labels for the test split
y_pred = model.predict(X_test)

# Step 5: Evaluation
# Score the test-split predictions first, then print the overall accuracy
# followed by the classification report.
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", report_text)