In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
df = pd.read_csv('dataset.csv')

# Drop rows with missing values
df = df.dropna()

# Convert date columns to datetime
df['created_at'] = pd.to_datetime(df['created_at'])
df['sale_time'] = pd.to_datetime(df['sale_time'])

# Define STR (Improved)
df['is_sold'] = df['sale_time'].notnull().astype(int)
cutoff_date = df['created_at'].max() - pd.Timedelta(days=30)
df['had_chance_to_sell'] = df['created_at'] < cutoff_date
adjusted_str = df.loc[df['had_chance_to_sell'], 'is_sold'].sum() / df.loc[df['had_chance_to_sell'], 'is_sold'].count()

print(f"Adjusted Sell-Through Rate (STR): {adjusted_str:.4f}")


# Select only the numeric columns for correlation analysis
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
numeric_data = df[numeric_cols]

# Calculate the correlation matrix
correlations = numeric_data.corr()

# Filter out highly correlated features (correlation > 0.8)
high_corr_features = [column for column, values in correlations.iterrows() if any(abs(values) > 0.8) and column != values.idxmax()]

# Print correlation matrix
print("Correlation Matrix:")
print(correlations)

# Print high correlation features
print("\nHighly Correlated Features:")
print(high_corr_features)

# Define initial features to keep
features_to_keep = [
    'listing_price_eur_fixed',
    'positive_feedback_rate',
    'listing_quality_string',
    'total_positive_feedback_count',
    'total_negative_feedback_count',
    'some_other_feature1',  # Replace with actual feature name
    'some_other_feature2'   # Replace with actual feature name
]

# Filter out high correlations from the list of features to keep
final_features = [column for column in numeric_data.columns if column not in high_corr_features and column in features_to_keep]

# If the final selected features are less than 7, add more features
if len(final_features) < 7:
    remaining_features = [column for column in numeric_data.columns if column not in high_corr_features and column not in final_features]
    final_features += remaining_features[:7 - len(final_features)]

print("\nFiltered dataset saved to 'filtered_dataset.csv'.")



Adjusted Sell-Through Rate (STR): nan
Correlation Matrix:
                               Unnamed: 0  listing_price_eur_fixed  \
Unnamed: 0                            1.0                     -1.0   
listing_price_eur_fixed              -1.0                      1.0   
gmv_eur_fixed                         1.0                     -1.0   
suggested_price_maximum              -1.0                      1.0   
lister_nth_listing                    1.0                     -1.0   
total_positive_feedback_count         1.0                     -1.0   
total_negative_feedback_count         1.0                     -1.0   
window_items_listed                   NaN                      NaN   
window_items_bought                   1.0                     -1.0   
window_items_sold                     1.0                     -1.0   
is_sold                               NaN                      NaN   

                               gmv_eur_fixed  suggested_price_maximum  \
Unnamed: 0                  

  adjusted_str = df.loc[df['had_chance_to_sell'], 'is_sold'].sum() / df.loc[df['had_chance_to_sell'], 'is_sold'].count()
