In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor


# Load data
# The data directory is defined once so both files can be repointed in one place.
DATA_DIR = r"/Users/talalkhan/Documents/Data Sets/Second Challange"
train_data = pd.read_csv(f"{DATA_DIR}/train.csv")
test_data = pd.read_csv(f"{DATA_DIR}/test.csv")
train_data = train_data.drop_duplicates()

# Encode categorical (object-dtype) features as integer codes.
# Each encoder is fitted on the train and test values together, so a category
# that appears only in the test file cannot make transform() fail with an
# "unseen label" error. When the test file holds no extra categories the codes
# are the same as those from a train-only fit (classes are the sorted uniques).
label_encoders = {}
for column in train_data.select_dtypes(include=['object']).columns:
    encoder = LabelEncoder()
    if column in test_data.columns:
        encoder.fit(pd.concat([train_data[column], test_data[column]], ignore_index=True))
        test_data[column] = encoder.transform(test_data[column])
    else:
        # Column exists only in the training file: nothing to encode on the test side.
        encoder.fit(train_data[column])
    train_data[column] = encoder.transform(train_data[column])
    label_encoders[column] = encoder

# Partition the training rows on whether 'children_school' is a whole number.
# Rows where the value is missing land in the non-whole group, because
# NaN % 1 is NaN and NaN == 0 evaluates to False.
is_whole_number = train_data['children_school'] % 1 == 0
whole_number_rows = train_data[is_whole_number]
non_whole_number_rows = train_data[~is_whole_number]

# Function to calculate covariance with price_doc for each column in a given dataset
def calculate_covariance_with_price(data):
    """Return each column's covariance with ``price_doc``, sorted ascending.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``price_doc`` column; every other column is paired
        with it in turn.

    Returns
    -------
    dict
        Maps column name -> covariance with ``price_doc``, ordered from the
        smallest covariance to the largest. Columns whose covariance is NaN
        (e.g. a column with no valid observations) come last, in their
        original column order.
    """
    covariances = {}
    for column in data.columns:
        if column != 'price_doc':
            # .cov() on the two-column frame is a 2x2 matrix; [0, 1] is the cross term.
            covariances[column] = data[[column, 'price_doc']].cov().iloc[0, 1]

    # NaN compares False against everything, so sorting it together with real
    # numbers gives an arbitrary order. Sort only the comparable values and
    # append the NaN entries afterwards.
    comparable = sorted(
        (item for item in covariances.items() if not pd.isna(item[1])),
        key=lambda item: item[1],
    )
    undefined = [item for item in covariances.items() if pd.isna(item[1])]
    return dict(comparable + undefined)

# Covariance of every feature with price_doc, computed separately per subset
covariance_whole = calculate_covariance_with_price(whole_number_rows)
covariance_non_whole = calculate_covariance_with_price(non_whole_number_rows)


# Whole-number subset: keep features with covariance < -1e8 or > 1e10
selected_covariance_whole = {
    feature: cov
    for feature, cov in covariance_whole.items()
    if cov < -100_000_000 or cov > 10_000_000_000
}
print(selected_covariance_whole)

# Non-whole subset: keep features with covariance < -1e10 or > 1e11
selected_covariance_non_whole = {
    feature: cov
    for feature, cov in covariance_non_whole.items()
    if cov < -10_000_000_000 or cov > 100_000_000_000
}
print(selected_covariance_non_whole)






# Function to calculate and sort correlations with price_doc for each column in a given dataset
def calculate_and_sort_correlation_with_price(data):
    """Return ``(column, correlation)`` pairs against ``price_doc``, sorted descending.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``price_doc`` column; every other column is paired
        with it in turn.

    Returns
    -------
    list of tuple
        ``(column name, correlation with price_doc)`` ordered from the highest
        correlation to the lowest. Columns whose correlation is NaN (e.g. a
        constant column) come last, in their original column order.
    """
    correlations = {
        # .corr() on the two-column frame is a 2x2 matrix; [0, 1] is the cross term.
        column: data[[column, 'price_doc']].corr().iloc[0, 1]
        for column in data.columns
        if column != 'price_doc'
    }

    # NaN compares False against everything, so sorting it together with real
    # numbers gives an arbitrary order. Rank only the comparable values and
    # append the NaN entries afterwards.
    comparable = [item for item in correlations.items() if not pd.isna(item[1])]
    undefined = [item for item in correlations.items() if pd.isna(item[1])]
    comparable.sort(key=lambda item: item[1], reverse=True)
    return comparable + undefined

# Rank every feature by its correlation with price_doc, separately per subset
sorted_correlation_whole = calculate_and_sort_correlation_with_price(whole_number_rows)
sorted_correlation_non_whole = calculate_and_sort_correlation_with_price(non_whole_number_rows)

# Whole-number subset: keep features with correlation > 0.3 or < -0.2
selected_correlation_whole = [
    (feature, corr)
    for feature, corr in sorted_correlation_whole
    if corr > 0.3 or corr < -0.2
]
print(selected_correlation_whole)

# Non-whole subset: keep features with correlation > 0.015 or < -0.012
selected_correlation_non_whole = [
    (feature, corr)
    for feature, corr in sorted_correlation_non_whole
    if corr > 0.015 or corr < -0.012
]
print(selected_correlation_non_whole)

# --- Whole-number subset -----------------------------------------------------
# Feature names that passed the correlation and covariance filters
selected_features_correlation_whole = {name for name, _corr in selected_correlation_whole}
selected_features_covariance_whole = set(selected_covariance_whole)

# Hand-picked features that are always included for this subset
provided_features = [
    "full_sq", "floor", "cafe_count_5000_price_high", "kindergarten_km", "life_sq", "public_transport_station_km",
    "public_transport_station_min_walk", "green_zone_km", "school_km", "water_km", "green_part_500", "fitness_km",
    "preschool_km", "sport_count_500", "railroad_km", "additional_education_km"
]

# Union of the three sources (duplicates collapse), sorted for a stable order
all_selected_features = (
    selected_features_correlation_whole
    | selected_features_covariance_whole
    | set(provided_features)
)
feature_list_whole = sorted(all_selected_features)

# --- Non-whole subset --------------------------------------------------------
# Feature names that passed the correlation and covariance filters
selected_features_correlation_non_whole = {name for name, _corr in selected_correlation_non_whole}
selected_features_covariance_non_whole = set(selected_covariance_non_whole)

# Hand-picked features that are always included for this subset
provided_features1 = [
    "full_sq"
]

# Union of the three sources (duplicates collapse), sorted for a stable order
all_selected_features_non_whole = (
    selected_features_correlation_non_whole
    | selected_features_covariance_non_whole
    | set(provided_features1)
)
feature_list_non_whole = sorted(all_selected_features_non_whole)

# Show the final non-whole feature list, one name per line, then its length
print("Final list of selected features:")
for feature_name in feature_list_non_whole:
    print(feature_name)

print(len(feature_list_non_whole))


# Feature matrix and target for each subset, restricted to its selected features
X_whole, y_whole = whole_number_rows[feature_list_whole], whole_number_rows['price_doc']
X_non_whole, y_non_whole = non_whole_number_rows[feature_list_non_whole], non_whole_number_rows['price_doc']

# Hold out 10% of each subset for validation (fixed seed for repeatability)
(X_train_whole,
 X_val_whole,
 y_train_whole,
 y_val_whole) = train_test_split(X_whole, y_whole, test_size=0.1, random_state=42)

(X_train_non_whole,
 X_val_non_whole,
 y_train_non_whole,
 y_val_non_whole) = train_test_split(X_non_whole, y_non_whole, test_size=0.1, random_state=42)


{'area_m': -23858099335215.94, 'sub_area': -269308211.6375193, 'metro_min_walk': -108810817.01734817, 'trc_sqm_500': 10731289550.820986, 'work_all': 12986872168.704714, '16_29_all': 13978403395.378593, 'raion_popul': 22765516911.780304, 'male_f': 32947352725.91563, 'female_f': 37977829639.2107, 'office_sqm_500': 49446917519.27475, 'trc_sqm_1000': 66641729039.167076, 'full_all': 70925587054.43039, 'trc_sqm_1500': 149301900627.71182, 'office_sqm_1000': 194917540476.89062, 'trc_sqm_2000': 279898311008.8841, 'office_sqm_1500': 475798732601.1094, 'trc_sqm_3000': 677735345779.0322, 'office_sqm_2000': 852103962369.7913, 'trc_sqm_5000': 1650558510091.0176, 'office_sqm_3000': 1875214633319.1877, 'office_sqm_5000': 4691740388588.324}
{'full_all': -113261569321.28836, 'office_sqm_1500': -102137605229.06575, 'trc_sqm_3000': -96676523866.28038, 'trc_sqm_500': -77277241403.57695, 'office_sqm_500': -37338695694.68065, '16_29_male': -21706021677.57643, 'female_f': -19668934306.258545, 'office_sqm_5000

In [4]:

# CatBoost parameters.
# NOTE: these are not used by the model fitted in this cell, which is a
# RandomForestRegressor; the dict is kept because other cells may refer to it.
cat_params = {
    'iterations': 100000,
    'depth': 12,
    'learning_rate': 0.05,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'random_seed': 42,
    'l2_leaf_reg': 0.05,
    'early_stopping_rounds': 40
}

# Random forest for the whole-number subset.
# max_depth is not passed, so the library default applies.
best_model_whole = RandomForestRegressor(
    n_estimators=600,
    min_samples_split=4,
    min_samples_leaf=3,
    max_leaf_nodes=None,
    max_features='sqrt',
    min_impurity_decrease=0.01,
    ccp_alpha=0,
    oob_score=True,   # out-of-bag score is computed during fit (requires bootstrap=True)
    bootstrap=True,
    random_state=42,
    n_jobs=-1,
)

best_model_whole.fit(X_train_whole, y_train_whole)

# The out-of-bag score and the 10% hold-out split were previously computed but
# never looked at; report both so the model is checked before it is used on
# the test file. Both numbers are R^2 (the regressor's default score).
print(f"Whole-number model OOB R^2: {best_model_whole.oob_score_:.4f}")
print(f"Whole-number model validation R^2: {best_model_whole.score(X_val_whole, y_val_whole):.4f}")


In [5]:

# Hyperparameters for the non-whole subset's Random Forest Regressor
rf_params = {
    'n_estimators': 400,  # Number of trees in the forest
    'max_depth': 3,       # Maximum depth of each tree
    'random_state': 42,   # Random seed for reproducibility
    'n_jobs': -1,         # Use all available CPU cores for parallel processing
    'warm_start': True,   # Only matters if fit() is called again on this same estimator
    'verbose': 1,         # Emit joblib progress lines while fitting / predicting
}

# Create the Random Forest Regressor
best_model_non_whole = RandomForestRegressor(**rf_params)

# Fit the model on the non-whole training split
best_model_non_whole.fit(X_train_non_whole, y_train_non_whole)

# The 10% hold-out split was previously created but never used; report its
# R^2 (the regressor's default score) so the model is checked before it is
# applied to the test file.
print(f"Non-whole model validation R^2: {best_model_non_whole.score(X_val_non_whole, y_val_non_whole):.4f}")

# Split the test rows with the same whole/non-whole rule used for training,
# keeping 'row ID' next to each subset's selected features
test_is_whole = test_data['children_school'] % 1 == 0
test_whole_selected = test_data.loc[test_is_whole, feature_list_whole + ['row ID']]
test_non_whole_selected = test_data.loc[~test_is_whole, feature_list_non_whole + ['row ID']]

# Predict each subset with its own model; 'row ID' is dropped before predicting
test_predictions_whole = best_model_whole.predict(
    test_whole_selected.drop(columns=['row ID'])
)
test_predictions_non_whole = best_model_non_whole.predict(
    test_non_whole_selected.drop(columns=['row ID'])
)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    6.6s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 400 out of 400 | elapsed:    0.0s finished


In [7]:

# Attach each subset's predictions to its own rows
test_whole_selected = test_whole_selected.assign(price_doc=test_predictions_whole)
test_non_whole_selected = test_non_whole_selected.assign(price_doc=test_predictions_non_whole)

# Stack the two subsets, keeping only the submission columns
submission_columns = ['row ID', 'price_doc']
final_predictions = pd.concat([
    test_whole_selected[submission_columns],
    test_non_whole_selected[submission_columns],
])

# Restore the original row order
final_predictions_sorted = final_predictions.sort_values('row ID')

# Show the final predictions
print(final_predictions_sorted)

# Write the submission file (without the index column)
final_predictions_sorted.to_csv("submission142_25253.csv", index=False)


       row ID     price_doc
0           1  1.312578e+07
1           2  5.770090e+06
2           3  5.318175e+06
3           4  5.715682e+06
4           5  5.316313e+06
...       ...           ...
77784   77785  5.687485e+07
77785   77786  5.446641e+07
77786   77787  3.473750e+06
77787   77788  3.473750e+06
77788   77789  3.473750e+06

[77789 rows x 2 columns]
