# In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Baseline region classifier: train TF-IDF + logistic regression on the tweets
# whose region is known, report held-out performance, then fill in the rows
# whose region is missing.

# Load the data
df = pd.read_csv('/Users/bochzhang/Downloads/tweets_country_region.csv')

# Replace missing tweet text with '' so the vectorizer only ever sees strings.
df['text'] = df['text'].fillna('')

# Rows with a region form the labelled set; rows without one are predicted below.
valid_regions = df['region'].notna()

# X holds the raw tweet text here; it is vectorized only after the split.
X = df.loc[valid_regions, 'text']
y = df.loc[valid_regions, 'region']

# Split the raw text BEFORE vectorizing. Fitting TF-IDF on all labelled rows
# would let the held-out tweets shape the vocabulary and IDF weights, which
# leaks test information into training and inflates the reported scores.
text_train, text_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn vocabulary/IDF from the training tweets only, then apply to the test set.
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Initialize and train Logistic Regression.
# `multi_class` is not passed: it is deprecated in recent scikit-learn, and the
# default lbfgs solver already fits a multinomial model for multiclass targets.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Make predictions on the held-out tweets
y_pred = lr.predict(X_test)

# Print classification report. The row names come from the labels actually
# present in the data; a hard-coded `target_names` list raises when the class
# count differs and mislabels rows if the sorted labels do not match the list.
print(classification_report(y_test, y_pred))

# Predict missing regions. Skip entirely when nothing is missing, because the
# vectorizer/model reject an input with zero rows.
missing_mask = df['region'].isna()
if missing_mask.any():
    missing_X = tfidf.transform(df.loc[missing_mask, 'text'])
    predictions = lr.predict(missing_X)

    # Update missing regions in place
    df.loc[missing_mask, 'region'] = predictions


# In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Weighted region classifier: TF-IDF (unigrams + bigrams) + logistic regression
# with per-class weights, evaluated on a stratified held-out split.

# Load data
df = pd.read_csv('/Users/bochzhang/Downloads/tweets_country_region.csv')

# Simple preprocessing
df['text'] = df['text'].fillna('')  # Handle missing text
valid_mask = df['region'].notna()   # Only use rows with regions

# Labelled data. X holds the raw tweet text; it is vectorized after the split.
X = df.loc[valid_mask, 'text']
y = df.loc[valid_mask, 'region']

# Split the raw text with stratification BEFORE vectorizing, so the held-out
# tweets cannot influence the TF-IDF vocabulary or IDF weights (fitting the
# vectorizer on all rows first leaks test data and inflates the scores).
text_train, text_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Prepare features - simple TF-IDF
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
    ngram_range=(1, 2)  # Include bigrams
)
X_train = tfidf.fit_transform(text_train)  # fit on training tweets only
X_test = tfidf.transform(text_test)        # reuse the training vocabulary

# Train model with class weights.
# Only Africa is weighted above the 1.0 baseline (to balance its
# precision/recall); every other region is left at the default weight.
class_weights = {
    'Americas': 1.0,
    'Europe': 1.0,
    'Asia': 1.0,
    'Africa': 1.2,
    'Oceania': 1.0
}

# `multi_class` is not passed: it is deprecated in recent scikit-learn, and the
# default lbfgs solver already fits a multinomial model for multiclass targets.
lr = LogisticRegression(
    class_weight=class_weights,
    max_iter=1000
)

# Fit and predict
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Print results
print(classification_report(y_test, y_pred))

