# Import Required Libraries
Import the necessary libraries, including pandas, seaborn, and matplotlib.pyplot.

In [31]:
# Import Required Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import date
from datetime import datetime
import holidays
import numpy as np
import demoji

In [32]:
df = pd.read_csv('merged_final.csv')


In [None]:
import pandas as pd
import numpy as np
import holidays

# Define Turkish holidays
tr_holidays = holidays.Turkey()

# Convert timestamp to datetime
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Apply the holiday check
df['is_holiday'] = df['timestamp'].apply(lambda x: x in tr_holidays)

# Extract additional date features
df['day_of_week'] = df['timestamp'].dt.dayofweek
df['month'] = df['timestamp'].dt.month
df['year'] = df['timestamp'].dt.year
df['hour'] = df['timestamp'].dt.hour
# Sort DataFrame
df = df.sort_values(['username', 'timestamp'], ascending=[True, True])

# Calculate mean post count

df['comments_count'].fillna(0, inplace=True)
#  replace 0's with 1s
df['comments_count'] = df['comments_count'].replace(0, 1)
df['like_count'] = df['like_count'].replace(0, 1)
df['like_count'].fillna(1, inplace=True)
# Handle categorical data
df['category_name'] = df['category_name'].fillna('Unknown')
print(df['category_name'].value_counts())
df['category_name'] = df['category_name'].astype('category').cat.codes
df['follower_count'] = df['follower_count']
# Handle hashtags, emojis, and tags
df['hashtags_x'] = df['hashtags_x'].fillna(' ')
df['tags'] = df['tags'].fillna(' ')
df['hashtags_present_in_caption'] = df['hashtags_x'].apply(lambda x: 1 if len(x.split('#')) > 1 else 0)
df['tags_present_in_caption'] = df['tags'].apply(lambda x: 1 if len(x.split('@')) > 1 else 0)
df['count_caption'] = df['caption'].apply(lambda x: len(x.split(' ')))
mean_post_count = df['post_count'].mean()
df['post_count'] = df['post_count'].fillna(mean_post_count)
# Drop unnecessary columns
df = df.drop(['full_name'], axis=1)

# Convert boolean columns to int
df[df.select_dtypes(['bool']).columns] = df[df.select_dtypes(['bool']).columns].astype(int)
df = df[df['hide_like_and_view_counts'] == 0]
df = df.drop('hide_like_and_view_counts', axis=1)

# Drop old separate media type columns if no longer needed
df.drop(['media_type_IMAGE', 'media_type_CAROUSEL_ALBUM'], axis=1, inplace=True)

# Sort columns by name
df = df.reindex(sorted(df.columns), axis=1)

# Save to CSV
df.to_csv('numeric_data.csv', index=False)


In [None]:
import numpy as np
import pandas as pd
import re

# Refined keyword list
lottery_keywords = [
    r"\bçekiliş(?:e|i|ler|ten|ler)?\b",  # Variations of çekiliş
    r"\bhediye(?:ler|m)?\b",  # Variations of hediye
    r"\bkatıl(?:mak|dım|ıyor)?\b",  # Variations of katıl
    r"\bbol şans\b",  # Exact match for "bol şans"
    r"\bödül(?:ler)?\b",  # Variations of ödül
    r"\bkazanan(?:lar|ı)?\b",  # Variations of kazanan
    r"\bsonuç(?:lar)?\b",  # Variations of sonuç
    r"\bkazan(?:mak|dı|ıyor)?\b",  # Variations of kazan
    r"\betiket(?:lemek|le)?\b",  # Variations of etiket and etiketlemek
]

# Add "yorum" as a separate condition
mandatory_keyword = r"\byorum(?: bırak| yaz|lara|la)?\b"  # Variations of yorum actions

# Create combined pattern
lottery_pattern = f"(?=.*{mandatory_keyword})(?=.*({'|'.join(lottery_keywords)}))"

# Apply the pattern to cleaned_caption
df['is_lottery'] = df['cleaned_caption'].str.contains(lottery_pattern, case=False, na=False)

# Function to replace high-quartile lottery comments
def adjust_comments(group):
    # Calculate Q3 for comments_count
    Q3 = np.percentile(group['comments_count'], 75)
    
    # Identify high-quartile and lottery posts
    high_quartile = group['comments_count'] > Q3
    lottery_posts = group['is_lottery']
    
    # Calculate mean of non-lottery posts for replacement
    mean_non_lottery = group.loc[~lottery_posts, 'comments_count'].mean()
    
    # Replace high-quartile lottery comments with mean_non_lottery
    group.loc[high_quartile & lottery_posts, 'comments_count'] = mean_non_lottery
    
    return group

# Group by user and adjust comments
df = df.groupby('username').apply(adjust_comments)
df.to_csv('lottery_fixed.csv', index=False)

In [37]:
# Load the data
df = pd.read_csv('lottery_fixed.csv')

# List to store the usernames to drop
users_to_drop = []
#  drop users with fewer than 2 posts
for username, group in df.groupby('username'):
    if len(group) < 3:
        users_to_drop.append(username)
# Drop the users with fewer than 3 posts
df = df[~df['username'].isin(users_to_drop)]


In [None]:
import pandas as pd
import numpy as np

# Assuming the necessary functions (e.g., calculate_all_stats, create_t_features, etc.) are defined earlier

# Function to add lagged features to the original dataset
def add_lagged_features_to_dataset(df):
    lagged_data = []
    
    # Iterate over each row in the dataset
    for idx, row in df.iterrows():
        # Get the user's group (all posts by the same user)
        user_group = df[df['username'] == row['username']]

        # Get all posts that are before the current post (this ensures the lags are valid)
        lags = user_group[user_group['timestamp'] < row['timestamp']]

        # Skip this row if there are no previous posts (lags)
        if len(lags) == 0:
            continue  # Skip to the next row

        # Lagged features calculation for valid rows with lags
        lagged_features = calculate_all_stats(lags, row)  # Lagged statistics
        media_type_stats = calculate_media_type_stats(lags, row)  # Media type statistics
        t_features = create_t_features(lags)  # Temporal features

        # Combine the lagged features with the current row
        modified_row = {
            **row.to_dict(),  # Add the original row values
            **lagged_features,  # Add lagged statistics
            **media_type_stats,  # Add media type statistics
            **t_features  # Add temporal features
        }

        # Append the modified row to the list
        lagged_data.append(modified_row)
    
    # Convert the lagged data list into a DataFrame and return
    return pd.DataFrame(lagged_data)


# Function to apply outlier handling and generate lagged features (from your original code)
def handle_outliers(lags, username, multiplier=100.0, column='like_count'):
    if lags[column].max() > 100000 or (column == 'comments_count' and lags[column].max() > 1000):
        q1 = lags[column].quantile(0.1)
        q3 = lags[column].quantile(0.9)
        iqr = q3 - q1

        if iqr == 0:
            lower_bound = lags[column].min() - 1
            upper_bound = lags[column].max() + 1
        else:
            lower_bound = max(q1 - (multiplier * iqr), 0)
            upper_bound = q3 + (multiplier * iqr)

        non_outlier_mask = (lags[column] >= lower_bound) & (lags[column] <= upper_bound)
        non_outlier_values = lags[column][non_outlier_mask]

        if not non_outlier_values.empty:
            mean_value = int(round(non_outlier_values.mean()))
            max_value = non_outlier_values.max()
            min_value = non_outlier_values.min()

            if (non_outlier_values.nunique() == 1):
                replacement_value = non_outlier_values.iloc[0]
            else:
                replacement_value = max_value
        else:
            mean_value = 0
            replacement_value = 0

        lags.loc[lags[column] > upper_bound, column] = max_value if (non_outlier_values.nunique() > 1) else replacement_value
        lags.loc[lags[column] < lower_bound, column] = min_value if (non_outlier_values.nunique() > 1) else replacement_value

        return lags
    else:
        return lags


def separate_target_and_lags(group):
    group = group.sort_values('timestamp', ascending=True)
    target_row = group.iloc[-1]  # Last row is the target
    lags = group.iloc[:-1]  # All but the last row are lags
    return target_row, lags


def calculate_all_stats(lags, target, suffix=''):
    like_to_comment_ratio = lags['like_count'].mean() / (lags['comments_count'].mean() + 1e-5)
    if target['comments_count'] == 0:
        target['comments_count'] = lags['comments_count'].mean()
    estimate = target['comments_count'] * like_to_comment_ratio

    return {
        'mean_likes' + suffix: lags['like_count'].mean(),
        'max_likes' + suffix: lags['like_count'].max(),
        'median_likes' + suffix: lags['like_count'].median(),
        'min_likes' + suffix: lags['like_count'].min(),
        'mean_comments' + suffix: lags['comments_count'].mean(),
        'max_comments' + suffix: lags['comments_count'].max(),
        'median_comments' + suffix: lags['comments_count'].median(),
        'min_comments' + suffix: lags['comments_count'].min(),
        'like_to_comment_ratio' + suffix: like_to_comment_ratio,
        'estimate_likes' + suffix: estimate
    }


def calculate_media_type_stats(lags, target):
    current_media_type = target['media_type_VIDEO']
    videos_count = lags['media_type_VIDEO'].sum()
    non_videos_count = len(lags) - videos_count
    is_primarily_videoer = 1 if videos_count > non_videos_count else 0
    media_type_lags = lags[lags['media_type_VIDEO'] == current_media_type].copy()

    if len(media_type_lags) == 0 or len(media_type_lags) < 3:
        media_type_lags = lags.copy()  # Fallback if no matching media type

    like_to_comment_ratio = media_type_lags['like_count'].mean() / (media_type_lags['comments_count'].mean() + 1e-5)
    if target['comments_count'] == 0:
        target['comments_count'] = media_type_lags['comments_count'].mean()
    estimate = target['comments_count'] * like_to_comment_ratio

    return {
        'is_primarily_videoer': is_primarily_videoer,
        'mean_likes_media_type': media_type_lags['like_count'].mean(),
        'max_likes_media_type': media_type_lags['like_count'].max(),
        'median_likes_media_type': media_type_lags['like_count'].median(),
        'min_likes_media_type': media_type_lags['like_count'].min(),
        'mean_comments_media_type': media_type_lags['comments_count'].mean(),
        'max_comments_media_type': media_type_lags['comments_count'].max(),
        'median_comments_media_type': media_type_lags['comments_count'].median(),
        'min_comments_media_type': media_type_lags['comments_count'].min(),
        'estimate_likes_media_type': estimate
    }


def create_t_features(lags):
    t_features = {}
    for t in range(1, 34):
        if len(lags) >= t:
            t_features[f't_{t}_likes'] = lags.iloc[-t]['like_count']
        else:
            t_features[f't_{t}_likes'] = 0
    return t_features


# Example: Assuming 'df' is your original DataFrame
df['timestamp'] = pd.to_datetime(df['timestamp'])  # Ensure timestamp is in datetime format
grouped = df.groupby('username')

# Now apply the function to add lagged features to the dataset
df_with_lags = add_lagged_features_to_dataset(df)

# Save the resulting DataFrame to a CSV file
df_with_lags.to_csv('df_with_lags.csv', index=False)

print("Lagged features added to dataset and saved as df_with_lags.csv.")


In [None]:
df_with_lags.columns

In [None]:
# Load Dataset
df = pd.read_csv('df_with_lags.csv')  # Replace with your dataset path
columns_to_keep = [
    'hashtags_x', 
    'cleaned_caption', 
    'caption', 
    'tags', 
    'Unnamed: 0', 'biography', 'id_x'
]
print(df[   'hashtags_x'])

In [41]:
import pandas as pd
import demoji
import math


# Load the sentiment mapping table
sentiment_df = pd.read_csv(r'tansu_emojis.csv')

# Create a dictionary for quick lookup of sentiment scores
sentiment_score_map = dict(zip(sentiment_df['Emoji'], sentiment_df['Score']))

# Define a function to calculate weighted sentiment
def calculate_weighted_sentiment(caption):
    emojis = demoji.findall(caption)  # Find all emojis in the caption
    if not emojis:
        return 0  # Return 0 if no emojis are found
    
    total_weighted_sentiment = 0
    total_count = 0
    
    # Calculate weighted sentiment score
    for emoji, desc in emojis.items():
        count = caption.count(emoji)
        sentiment_score = sentiment_score_map.get(emoji)
        
        # Skip if sentiment score is NaN or None
        if sentiment_score is None or math.isnan(sentiment_score):
            continue
        
        total_weighted_sentiment += sentiment_score * count  # Multiply sentiment score by emoji count
        total_count += count  # Add emoji count to total count
    
    # Return the weighted average sentiment score, or 0 if no valid emojis were found
    return total_weighted_sentiment / total_count if total_count else 0

# Process each caption and output a single column with the average sentiment
df['sentiment'] = df['caption'].apply(
    lambda caption: calculate_weighted_sentiment(caption) if isinstance(caption, str) else 0
)


In [42]:
df.to_csv('sentiment_data.csv', index=False)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the dataset
df = pd.read_csv("sentiment_data.csv")

# Function to remove words of length less than 5
def remove_short_words(text):
    """
    Remove words with length less than 5 from a given text.
    """
    return " ".join([word for word in text.split() if len(word) >= 5])

# Step 1: Fill NaN values and remove '#' from hashtags
df["cleaned_caption"] = df["cleaned_caption"].fillna("").apply(remove_short_words)
df["hashtags_x"] = df["hashtags_x"].fillna("").str.replace('#', '', regex=False).apply(remove_short_words)

# Step 2: Apply TF-IDF Vectorization
# Define the maximum number of features for captions and hashtags
caption_tfidf_max_features = 300
hashtag_tfidf_max_features = 100

# TF-IDF Vectorizer for captions
tfidf_caption = TfidfVectorizer(max_features=caption_tfidf_max_features)
caption_tfidf_matrix = tfidf_caption.fit_transform(df["cleaned_caption"])

# TF-IDF Vectorizer for hashtags
tfidf_hashtags = TfidfVectorizer(max_features=hashtag_tfidf_max_features)
hashtags_tfidf_matrix = tfidf_hashtags.fit_transform(df["hashtags_x"])

# Step 3: Convert TF-IDF Matrices to DataFrames
caption_tfidf_df = pd.DataFrame(caption_tfidf_matrix.toarray(), columns=tfidf_caption.get_feature_names_out())
hashtags_tfidf_df = pd.DataFrame(hashtags_tfidf_matrix.toarray(), columns=tfidf_hashtags.get_feature_names_out())

# Step 4: Combine TF-IDF Features with Original DataFrame
df = pd.concat([df, caption_tfidf_df.add_prefix("caption_tfidf_"), hashtags_tfidf_df.add_prefix("hashtags_tfidf_")], axis=1)

# Step 5: Verify Results
print(df.head())

# Save the resulting DataFrame (optional)
df.to_csv("tfidf_transformed_data.csv", index=False)
print("TF-IDF transformed data saved to 'tfidf_transformed_data.csv'.")


In [None]:
# Saving the vectorizer after fitting it on training data
import joblib
joblib.dump(tfidf_caption, 'tfidf_vectorizer_captions.pkl')
joblib.dump(tfidf_hashtags, 'tfidf_vectorizer_hashtags.pkl')


## NLP

In [None]:
import nltk
from nltk.corpus import stopwords
import re
import pandas as pd
import zeyrek

# Initialize Zeyrek MorphAnalyzer
zeyrek = zeyrek.MorphAnalyzer()

# Download necessary NLTK data
nltk.download('stopwords')

# Define the list of holiday-related words
holiday_words = [
    "cumhuriyet", "kutlu", "türkiye", "bayram", "zafer", 
    "kurtuluş", "özgürlük", "şehit", "atatürk", "milli", "egemenlik", 
    "birlik", "beraberlik",
    "anma",  "al bayrak", "100", "23", "millî egemenlik", 
    "23 nisan", "30 ağustos", "19 mayıs", "29 ekim", 
    "zafer bayramı", "cumhuriyet bayramı", "kutlu olsun", "kutlama",
    "kutlamak", "anmak", "birleşmek","başarmak", "kutlanmak"
]

# Create a set for faster lookup
holiday_words_set = set(holiday_words)

# Function to check if any holiday word is in the cleaned hashtags
def check_holiday_words(caption):
    if not isinstance(caption, str):
        return 0
    words = caption.split()
    return 1 if any(word in holiday_words_set for word in words) else 0

# Create a new column to flag the presence of holiday words
df['holiday_flag'] = df['cleaned_caption'].apply(check_holiday_words)

In [None]:
print(df['holiday_flag'].value_counts())

In [47]:
columns_to_keep = [
    'hashtags_x', 
    'cleaned_caption', 
    'caption', 
    'tags', 
    'Unnamed: 0', 'biography', 'id_x'
]
df.drop(columns_to_keep, axis=1, inplace=True)


In [None]:
print(df.select_dtypes(include=['object']).columns)

In [None]:
df.columns

In [None]:
df.to_csv("last.csv")

In [16]:
import pandas as pd
df = pd.read_csv("last.csv")


In [9]:
with open('columns.txt', 'w') as f:
    for i in df.columns:
        f.write(i + '\n')

In [None]:
df['holiday_flag']

In [17]:
df = df.drop(["Unnamed: 0"], axis=1)


In [6]:
# Print the ranges (min and max) of all columns in the dataframe, excluding "timestamp" and "username"
for column in df.columns:
    if column not in ["timestamp", "username"]:
        min_value = df[column].min()
        max_value = df[column].max()
        # print(f"Min: {min_value} | Max: {max_value}")
        if min_value < 0:
            print(f"Column: {column} | Min: {min_value} | Max: {max_value}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot distribution of the target variable
plt.figure(figsize=(10, 6))
sns.histplot(df['like_count'])
plt.title('Distribution of Like Count')
plt.xlabel('Like Count')
plt.ylabel('Frequency')
plt.show()


In [None]:
# Check the distribution of categorical features
categorical_columns = [
    'category_name', 'is_verified', 'is_business_account', 'holiday_flag'
]

for col in categorical_columns:
    print(f"Distribution of {col}:")
    print(df[col].value_counts())
    print("\n")


In [34]:

# Load your dataset
df = pd.read_csv('last.csv')  # Example of loading your dataset
df = df.drop(["Unnamed: 0", "timestamp", "username", "category_name"], axis=1)
df['estimate_likes'] = np.maximum(df['estimate_likes'], df['min_likes'])
df['estimate_likes_media_type'] = np.maximum(df['estimate_likes_media_type'], df['min_likes_media_type'])

In [35]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# Assuming df is already loaded

# Step 1: Data Preparation
print("Data shape:", df.shape)

# List of features that should be log-transformed
log_transformed_features = [
    'comments_count', 'follower_count', 'following_count', 'highlight_reel_count',
    'like_count', 'post_count', 'mean_likes', 'max_likes', 'median_likes', 
    'min_likes', 'mean_comments', 'max_comments', 'median_comments', 
    'min_comments', 'estimate_likes', 'mean_likes_media_type', 'max_likes_media_type', 
    'median_likes_media_type', 'min_likes_media_type', 'mean_comments_media_type', 
    'max_comments_media_type', 'median_comments_media_type', 'min_comments_media_type', 
    'estimate_likes_media_type', 't_1_likes', 't_2_likes', 't_3_likes', 't_4_likes', 
    't_5_likes', 't_6_likes', 't_7_likes', 't_8_likes', 't_9_likes', 't_10_likes',
    't_11_likes', 't_12_likes', 't_13_likes', 't_14_likes', 't_15_likes', 't_16_likes', 
    't_17_likes', 't_18_likes', 't_19_likes', 't_20_likes', 't_21_likes', 't_22_likes', 
    't_23_likes', 't_24_likes', 't_25_likes', 't_26_likes', 't_27_likes', 't_28_likes', 
    't_29_likes', 't_30_likes', 't_31_likes', 't_32_likes', 't_33_likes'
]

# Apply log transformation with a small epsilon to avoid log(0)
for feature in log_transformed_features:
    df[feature] = np.log(df[feature] + 1)
print(df['like_count'].value_counts())

# List of binary categorical features
binary_features = ['is_verified', 'is_business_account', 'is_holiday', 'is_professional_account', 'is_lottery', 'media_type_VIDEO']

# Encode binary categorical features using Label Encoding
encoder = LabelEncoder()
for feature in binary_features:
    df[feature] = encoder.fit_transform(df[feature])

# Apply MinMax scaling to all numerical features
numerical_features = df.select_dtypes(include=['float64', 'int64']).columns  # Select all numerical features

scaler = MinMaxScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Split the dataset into features and target
X = df.drop(columns='like_count')  # Replace 'target_variable' with your target column name
y = df['like_count']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Now the dataset is preprocessed and ready for model training


Data shape: (160899, 474)
like_count
2.302585     2153
2.079442     2138
2.639057     2130
2.197225     2102
1.945910     2049
             ... 
11.861693       1
13.140728       1
12.778452       1
10.604057       1
8.029759        1
Name: count, Length: 17901, dtype: int64


In [36]:
print(df['like_count'].value_counts())

like_count
0.104650    2153
0.090141    2138
0.126528    2130
0.097799    2102
0.081458    2049
            ... 
0.726209       1
0.809376       1
0.785820       1
0.644434       1
0.477047       1
Name: count, Length: 17901, dtype: int64


In [39]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Initialize the linear regression model
model = LinearRegression()

# Train the model using the training data
model.fit(X_train, y_train)

# Make predictions on the test set (log-transformed predictions)
y_pred_log = model.predict(X_test)

# Reverse the log transformation (use exp to get the original scale)
epsilon = 1  # The value used in log(x + 1)
y_pred_original_log = np.exp(y_pred_log) - epsilon
y_test_original_log = np.exp(y_test) - epsilon

# Now reverse the scaling applied to the features (Descaling)
# For each feature, we need to reverse the scaling:
X_train_original = scaler.inverse_transform(X_train)  # Descale X_train
X_test_original = scaler.inverse_transform(X_test)  # Descale X_test

# Here, we need to use the predicted and true values on their original scale (after descaling and log reversal)
# Calculate error metrics on the original scale
mae = mean_absolute_error(y_test_original_log, y_pred_original_log)
mse = mean_squared_error(y_test_original_log, y_pred_original_log)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_original_log, y_pred_original_log)

# Output the evaluation metrics
print("Model Evaluation Metrics on Original Scale (Descaled):")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R² (Coefficient of Determination): {r2}")


ValueError: operands could not be broadcast together with shapes (128719,473) (474,) (128719,473) 

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
import pickle

# Load the dataset
df = pd.read_csv("last.csv")
df = df.drop(["Unnamed: 0"], axis=1)

# Step 1: Data Preparation
print("Data shape:", df.shape)

# Define the columns to exclude and the features
excluded_columns = ['timestamp', 'like_count', 'id_x', 'highlight_reel_count',
                    'following_count', 'category_name', 'username',
                    'count_caption', 'id_x']
features = [col for col in df.columns if col not in excluded_columns and "estimate" not in col]
X = df[features]
y = df['like_count']

# Step 2: Apply log transformation to the target variable (log-space training)
y_log = np.log1p(y)  # Log transform the target variable

# Step 3: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.24, random_state=15)

# Step 4: Train the Model with Poisson Regression
best_params = {
    'learning_rate': 0.1,
    'max_depth': 7,
    'n_estimators': 200,
    'subsample': 0.8,
    'objective': 'reg:gamma',  
    'seed': 8,
    'eval_metric': 'mae'  # Set eval_metric here to avoid the warning
}

best_model = XGBRegressor(**best_params)
best_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=True  # No need to include eval_metric here, it's already set in params
)

# Step 5: Predict in log space
y_train_pred_log = best_model.predict(X_train)
y_test_pred_log = best_model.predict(X_test)

# Step 6: Convert predictions back to original scale (inverse log transformation)
y_train_pred = np.expm1(y_train_pred_log)  # Inverse of log1p is expm1
y_test_pred = np.expm1(y_test_pred_log)

# Step 7: Post-process predictions to ensure non-negative values (Poisson regression guarantees non-negativity)
y_train_pred = np.maximum(y_train_pred, 0)
y_test_pred = np.maximum(y_test_pred, 0)

# Calculate MAE for both train and test sets
train_mae = mean_absolute_error(np.expm1(y_train), y_train_pred)  # Apply inverse log transformation to y_train
test_mae = mean_absolute_error(np.expm1(y_test), y_test_pred)      # Apply inverse log transformation to y_test
print(f"Train MAE: {train_mae:.4f}")
print(f"Test MAE: {test_mae:.4f}")

# Step 8: Calculate Log MAE (in log-space)
train_log_mae = mean_absolute_error(y_train, y_train_pred_log)
test_log_mae = mean_absolute_error(y_test, y_test_pred_log)
print(f"Train Log MAE: {train_log_mae:.4f}")
print(f"Test Log MAE: {test_log_mae:.4f}")

# Step 9: Save Predictions (after inverse log transformation)
df_test = df.loc[X_test.index]
df_test['predicted_like_count'] = y_test_pred
df_test.to_csv('final_predictions_logspace.csv', index=False)
print("Final predictions saved to 'final_predictions_logspace.csv'.")

# Step 10: Save the Model
with open('model_logspace.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)
print("Model saved as 'model_logspace.pkl'")


Data shape: (160899, 477)
[0]	validation_0-mae:2.61275
[1]	validation_0-mae:2.50837
[2]	validation_0-mae:2.40410
[3]	validation_0-mae:2.30034
[4]	validation_0-mae:2.19727
[5]	validation_0-mae:2.09534
[6]	validation_0-mae:1.99482
[7]	validation_0-mae:1.89603
[8]	validation_0-mae:1.79941
[9]	validation_0-mae:1.70527
[10]	validation_0-mae:1.61380
[11]	validation_0-mae:1.52529
[12]	validation_0-mae:1.44013
[13]	validation_0-mae:1.35850
[14]	validation_0-mae:1.28068
[15]	validation_0-mae:1.20676
[16]	validation_0-mae:1.13681
[17]	validation_0-mae:1.07083
[18]	validation_0-mae:1.00894
[19]	validation_0-mae:0.95129
[20]	validation_0-mae:0.89750
[21]	validation_0-mae:0.84762
[22]	validation_0-mae:0.80173
[23]	validation_0-mae:0.75955
[24]	validation_0-mae:0.72089
[25]	validation_0-mae:0.68549
[26]	validation_0-mae:0.65344
[27]	validation_0-mae:0.62434
[28]	validation_0-mae:0.59791
[29]	validation_0-mae:0.57414
[30]	validation_0-mae:0.55289
[31]	validation_0-mae:0.53397
[32]	validation_0-mae:0.

In [None]:
# Check for negative predictions
negative_predictions = df_test[df_test['predicted_like_count'] < 0]
print(f"Number of negative predictions: {len(negative_predictions)}")

# Optionally, display some negative predictions
print(negative_predictions[['like_count', 'predicted_like_count']])

# Check for NaN values in predictions
nan_predictions = df_test[df_test['predicted_like_count'].isna()]
print(f"Number of NaN predictions: {len(nan_predictions)}")

# Optionally, display NaN predictions
print(nan_predictions[['like_count', 'predicted_like_count']])


In [None]:
import xgboost as xgb

# Get feature importance
importance = best_model.feature_importances_

# Create a DataFrame for feature importance
feature_importance_df = pd.DataFrame({
    'feature': features,
    'importance': importance
}).sort_values(by='importance', ascending=False)

print("Feature Importance:")
print(feature_importance_df)


In [None]:
df_test.columns

In [None]:
import numpy as np
import pandas as pd

# List of columns
cols = ['like_count', 'predicted_like_count', 'estimate_likes_media_type']

# Create a function to compute MAE
def mae(true_values, predicted_values):
    return np.mean(np.abs(true_values - predicted_values))

# Create a dictionary to store the MAE results
mae_results = {}

# Compute MAE for each pair of columns
for i in range(3):
    for j in range(3):
        if i != j:
            mae_value = mae(df_test[cols[i]], df_test[cols[j]])
            mae_results[f'{cols[i]} vs {cols[j]}'] = mae_value

# Print the MAE results
for pair, mae_value in mae_results.items():
    print(f'MAE between {pair}: {mae_value:.4f}')


In [None]:
import numpy as np
import pandas as pd

# Function to compute MAE
def mae(true_values, predicted_values):
    return np.mean(np.abs(true_values - predicted_values))

# Function to get largest contributors
def get_largest_contributors(true_values, predicted_values, num_contributors=5):
    # Compute absolute errors for each row
    absolute_errors = np.abs(true_values - predicted_values)
    
    # Get the rows with the largest errors
    largest_contributors = absolute_errors.nlargest(num_contributors)
    
    return largest_contributors

# Compute MAE for like_count and predicted_like_count
mae_value = mae(df_test['like_count'], df_test['predicted_like_count'])

# Get the largest contributors
largest_contributors = get_largest_contributors(df_test['like_count'], df_test['predicted_like_count'])

# Output the results
print(f'MAE between like_count and predicted_like_count: {mae_value:.4f}')
print("\nLargest Contributors (Top 5 rows):")
print(largest_contributors)


In [24]:
df_test['residual'] = np.abs(df_test['like_count'] - df_test['predicted_like_count'])

In [None]:
df_test

In [None]:
df_test.sort_values('residual', ascending=False).head(10)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot residuals to see how they're distributed
plt.figure(figsize=(10, 6))
sns.histplot(df_test['residual'], bins=50, color='blue', kde=True)
plt.title("Distribution of Residuals (Prediction Error)")
plt.xlabel("Residual")
plt.ylabel("Frequency")
plt.show()

# Check for large residuals (i.e., cases where the model has a large error)
large_residuals = df_test[df_test['residual'].abs() > 1e6]  # Adjust threshold as necessary
print("Data points with large residuals (model errors):")
print(large_residuals[['like_count', 'predicted_like_count', 'residual']])


In [None]:
import matplotlib.pyplot as plt
import xgboost as xgb

# 1. **Feature Importance using XGBoost's built-in method**
# Visualize the feature importance (it ranks features based on their impact on the model)
plt.figure(figsize=(10, 6))
xgb.plot_importance(best_model, importance_type='weight', max_num_features=10, height=0.8)
plt.title('Top 10 Feature Importance by Weight')
plt.show()

# You can also use 'gain' or 'cover' for other types of importance
plt.figure(figsize=(10, 6))
xgb.plot_importance(best_model, importance_type='gain', max_num_features=10, height=0.8)
plt.title('Top 10 Feature Importance by Gain')
plt.show()


In [None]:
# Define a threshold for 'celebrity' (top 5% by like_count)
celebrity_threshold = df_test['like_count'].quantile(0.999)

# Filter out celebrity data (those with high like_count)
celebrity_data = df_test[df_test['like_count'] >= celebrity_threshold]

# Check residuals for celebrity data
celebrity_residuals = celebrity_data[['like_count', 'predicted_like_count', 'residual']]
print(celebrity_residuals)

# Visualize residuals for celebrities
plt.figure(figsize=(10, 6))
sns.scatterplot(x=celebrity_residuals.index, y=celebrity_residuals['residual'], color='red')
plt.title("Residuals for Celebrity Data (High Like Count)")
plt.xlabel("Index")
plt.ylabel("Residual (Prediction Error)")
plt.show()
