In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Locations of the three score tables, relative to the working directory.
profile_csv = 'profile_score_updated_normalized.csv'
connectivity_csv = 'connectivity_influence_scaled.csv'
fraud_csv = 'fraud_score_updated_normalized.csv'

# Load each table into its own DataFrame.
profile_scores = pd.read_csv(profile_csv)
connectivity_scores = pd.read_csv(connectivity_csv)
fraud_scores = pd.read_csv(fraud_csv)

In [3]:
# Preview the first five rows of the profile score table.
profile_scores.head(n=5)

Unnamed: 0,ID,profile_score_updated_normalized
0,0,0.437532
1,1,0.937482
2,2,0.874973
3,3,0.687475
4,4,0.687506


In [4]:
# Preview the first five rows of the connectivity score table.
connectivity_scores.head(n=5)

Unnamed: 0,ID,connectivity_influence_scaled
0,0,0.084637
1,1,0.239201
2,2,0.309499
3,3,0.138785
4,4,0.437985


In [5]:
# Preview the first five rows of the fraud score table.
# NOTE(review): ID renders as float here (0.0, 1.0, ...) while the other two
# tables show integer IDs -- presumably missing IDs or a float-typed export
# upstream; confirm before relying on ID as a key.
fraud_scores.head(n=5)

Unnamed: 0,ID,fraud_score_updated_normalized
0,0.0,0.017983
1,1.0,0.103368
2,2.0,0.152717
3,3.0,0.986358
4,4.0,0.062396


In [6]:
# Inner-join the three score tables on ID: only IDs present in all three
# tables survive. Fraud scores are joined with profile scores first, then the
# connectivity scores are joined onto that result.
merged_df = fraud_scores.merge(profile_scores, how='inner', on='ID')
final_merged_df = merged_df.merge(connectivity_scores, how='inner', on='ID')

In [7]:
# Preview the first five rows of the joined table.
final_merged_df.head(n=5)

Unnamed: 0,ID,fraud_score_updated_normalized,profile_score_updated_normalized,connectivity_influence_scaled
0,0.0,0.017983,0.437532,0.084637
1,1.0,0.103368,0.937482,0.239201
2,2.0,0.152717,0.874973,0.309499
3,3.0,0.986358,0.687475,0.138785
4,4.0,0.062396,0.687506,0.437985


In [8]:
# Persist the joined table. index=False keeps the positional row index out of
# the file (otherwise it is written as an unnamed first column and comes back
# as "Unnamed: 0" on reload), matching how the final export of this notebook
# is written. The header row is written by default.
final_merged_df.to_csv('merged.csv', index=False)

In [9]:
# The three score columns that are combined into a single score.
score_columns = [
    'fraud_score_updated_normalized',
    'profile_score_updated_normalized',
    'connectivity_influence_scaled',
]
features = final_merged_df[score_columns]

# Start from equal weighting: one weight of 1.0 per feature column.
initial_weights = np.ones(len(score_columns))

In [10]:
# Baseline score: weighted sum of the feature columns using the initial weights.
final_merged_df['initial_score'] = features @ initial_weights

In [11]:
# Model inputs are the feature columns; the target is the baseline score
# that was computed from those same columns.
X, y = features, final_merged_df['initial_score']

In [12]:
# Precaution: treat +/-inf as missing, then drop every row that has a
# missing feature value.
X_without_inf = X.replace([np.inf, -np.inf], np.nan)
X_cleaned = X_without_inf.dropna()

# Keep only the targets whose rows survived the cleaning above.
y_cleaned = y.loc[X_cleaned.index]

In [13]:
# Fit a decision-tree regressor on the cleaned features; random_state is fixed
# so repeated runs build the same tree.
# NOTE(review): the target is the unweighted sum of these same feature columns
# (initial_weights are all ones), so the importances read from this model are
# not learned from an external label -- confirm this is the intended way to
# derive the weights.
tree_model = DecisionTreeRegressor(random_state=42)
tree_model.fit(X=X_cleaned, y=y_cleaned)

In [14]:
# Read the per-feature importances from the fitted tree: one value per
# column of X_cleaned, in column order.
feature_importances = tree_model.feature_importances_

In [15]:
# Report each feature column next to the importance the fitted tree gave it.
for position, feature in enumerate(features.columns):
    importance = feature_importances[position]
    print(f'{feature}: {importance}')

fraud_score_updated_normalized: 0.4337176603080302
profile_score_updated_normalized: 0.5452057964221559
connectivity_influence_scaled: 0.02107654326981386


In [16]:
# Re-score each cleaned row as a weighted sum of its features, using the
# tree's feature importances as the weights.
weighted_scores = X_cleaned.dot(feature_importances)

# Column assignment aligns on the row index, so any row that was dropped
# during cleaning ends up with NaN here.
final_merged_df['updated_score'] = weighted_scores

In [17]:
# Preview the first five rows, now including the initial and updated scores.
final_merged_df.head(n=5)

Unnamed: 0,ID,fraud_score_updated_normalized,profile_score_updated_normalized,connectivity_influence_scaled,initial_score,updated_score
0,0.0,0.017983,0.437532,0.084637,0.540151,0.248128
1,1.0,0.103368,0.937482,0.239201,1.280051,0.560995
2,2.0,0.152717,0.874973,0.309499,1.337189,0.5498
3,3.0,0.986358,0.687475,0.138785,1.812618,0.805542
4,4.0,0.062396,0.687506,0.437985,1.187886,0.411125


In [19]:
# Export Satya's analysis: one row per ID with its updated score.
output_file_path = 'satya_analysis.csv'

# Keep only the identifier and the final score for the export.
output_df = final_merged_df.loc[:, ['ID', 'updated_score']]

# Write the CSV without the positional row index.
output_df.to_csv(output_file_path, index=False)