In [41]:
import pandas as pd

# Read the telecom dataset from the CSV in the working directory into `df`.
df = pd.read_csv(filepath_or_buffer="telecom_feature_selected.csv")

In [42]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,circle,year,month,service_provider,value,technology,tech_yearly_sum,tech_avg,tech_value_diff,tech_yearly_share,technology_missing,value_lag1,value_lag2,churn_change,churn
0,0,Andaman and Nicobar Islands,2009,April,Aircel,852468,3G,7091626007,2179738.0,0.0,0.00012,0,0.0,0.0,0.0,0
1,1,Andaman and Nicobar Islands,2009,February,Aircel,852468,3G,7091626007,2179738.0,0.0,0.00012,0,852468.0,0.0,0.0,0
2,2,Andaman and Nicobar Islands,2009,January,Aircel,852468,3G,7091626007,2179738.0,0.0,0.00012,0,852468.0,852468.0,0.0,0
3,3,Andaman and Nicobar Islands,2009,March,Aircel,852468,3G,7091626007,2179738.0,0.0,0.00012,0,852468.0,852468.0,0.0,0
4,4,Andaman and Nicobar Islands,2009,May,Aircel,852468,3G,7091626007,2179738.0,0.0,0.00012,0,852468.0,852468.0,0.0,0


In [43]:
# Remove the leftover 'Unnamed: 0' column that came in with the CSV.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been dropped is a no-op instead of raising KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [44]:
# Preview the last five rows to check the frame after the column drop.
df.tail(n=5)

Unnamed: 0,circle,year,month,service_provider,value,technology,tech_yearly_sum,tech_avg,tech_value_diff,tech_yearly_share,technology_missing,value_lag1,value_lag2,churn_change,churn
65886,West Bengal,2025,February,Vodafone-Idea,150,5G,4347862126,9005452.0,-284090.0,3.449971e-08,0,11722068.0,150.0,-11721918.0,1
65887,West Bengal,2025,January,Vodafone-Idea,11742282,5G,4347862126,9005452.0,-12387806.0,0.002700702,0,150.0,11722068.0,11742132.0,0
65888,West Bengal,2025,January,Vodafone-Idea,150,5G,4347862126,9005452.0,-279230.0,3.449971e-08,0,11742282.0,150.0,-11742132.0,1
65889,West Bengal,2025,March,Vodafone-Idea,11620668,5G,4347862126,9005452.0,-12774587.0,0.002672731,0,150.0,11742282.0,11620518.0,0
65890,West Bengal,2025,March,Vodafone-Idea,150,5G,4347862126,9005452.0,-290657.0,3.449971e-08,0,11620668.0,150.0,-11620518.0,1


In [45]:
from sklearn.preprocessing import LabelEncoder

# Columns holding string categories that are replaced by integer codes.
categorical_columns = ('circle', 'technology', 'service_provider', 'month')

# One dedicated encoder per column, kept in a dict keyed by column name so the
# integer codes can be mapped back to the original labels later on.
label_encoders = {name: LabelEncoder() for name in categorical_columns}

for col, encoder in label_encoders.items():
    # fit_transform learns the column's classes and overwrites the column
    # in `df` with the corresponding integer codes.
    df[col] = encoder.fit_transform(df[col])


In [46]:
# Preview the first five rows to inspect the label-encoded columns.
df.head(n=5)

Unnamed: 0,circle,year,month,service_provider,value,technology,tech_yearly_sum,tech_avg,tech_value_diff,tech_yearly_share,technology_missing,value_lag1,value_lag2,churn_change,churn
0,0,2009,0,1,852468,0,7091626007,2179738.0,0.0,0.00012,0,0.0,0.0,0.0,0
1,0,2009,3,1,852468,0,7091626007,2179738.0,0.0,0.00012,0,852468.0,0.0,0.0,0
2,0,2009,4,1,852468,0,7091626007,2179738.0,0.0,0.00012,0,852468.0,852468.0,0.0,0
3,0,2009,7,1,852468,0,7091626007,2179738.0,0.0,0.00012,0,852468.0,852468.0,0.0,0
4,0,2009,8,1,852468,0,7091626007,2179738.0,0.0,0.00012,0,852468.0,852468.0,0.0,0


In [47]:
from sklearn.ensemble import RandomForestClassifier

# Target is the `churn` column; the feature matrix is every other column
# except `churn_change` and `value` (the latter is excluded because, per the
# original author's note, value is not predictive).
y = df['churn']
X = df.drop(columns=['churn', 'churn_change', 'value'])

# Fit a 100-tree random forest with a fixed seed so the run is repeatable.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)

# Pair every feature name with the forest's importance score and rank the
# features from most to least important, renumbering the rows from 0.
feature_scores = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_,
})
ranked_scores = feature_scores.sort_values(by='Importance', ascending=False)
importance_df = ranked_scores.reset_index(drop=True)

print("\nFeature Importance:\n", importance_df)



Feature Importance:
                Feature  Importance
0           value_lag1    0.403232
1    tech_yearly_share    0.219572
2           value_lag2    0.119942
3      tech_value_diff    0.089285
4     service_provider    0.047325
5                month    0.038835
6      tech_yearly_sum    0.029586
7                 year    0.023677
8               circle    0.017992
9             tech_avg    0.005727
10          technology    0.004827
11  technology_missing    0.000000


In [48]:
# Keep only the features whose importance is strictly above the 0.01 cut-off.
above_threshold = importance_df['Importance'] > 0.01
selected_features = importance_df.loc[above_threshold, 'Feature'].tolist()

# The target column is not part of the importance table, so append it
# explicitly when slicing the frame.
df_selected = df[[*selected_features, 'churn']]


In [49]:
# Preview the last five rows of the reduced frame (selected features + churn).
df_selected.tail(n=5)

Unnamed: 0,value_lag1,tech_yearly_share,value_lag2,tech_value_diff,service_provider,month,tech_yearly_sum,year,circle,churn
65886,11722068.0,3.449971e-08,150.0,-284090.0,26,3,4347862126,2025,29,1
65887,150.0,0.002700702,11722068.0,-12387806.0,26,4,4347862126,2025,29,0
65888,11742282.0,3.449971e-08,150.0,-279230.0,26,4,4347862126,2025,29,1
65889,150.0,0.002672731,11742282.0,-12774587.0,26,7,4347862126,2025,29,0
65890,11620668.0,3.449971e-08,150.0,-290657.0,26,7,4347862126,2025,29,1


In [50]:
# Map the integer codes in `df` back to the original labels, using the encoder
# stored for each column in `label_encoders` (keys: 'circle', 'technology',
# 'service_provider', 'month').
for col, encoder in label_encoders.items():
    df[col] = encoder.inverse_transform(df[col])

In [51]:
# Preview the first five rows to confirm the categorical labels are restored.
df.head(n=5)

Unnamed: 0,circle,year,month,service_provider,value,technology,tech_yearly_sum,tech_avg,tech_value_diff,tech_yearly_share,technology_missing,value_lag1,value_lag2,churn_change,churn
0,Andaman and Nicobar Islands,2009,April,Aircel,852468,3G,7091626007,2179738.0,0.0,0.00012,0,0.0,0.0,0.0,0
1,Andaman and Nicobar Islands,2009,February,Aircel,852468,3G,7091626007,2179738.0,0.0,0.00012,0,852468.0,0.0,0.0,0
2,Andaman and Nicobar Islands,2009,January,Aircel,852468,3G,7091626007,2179738.0,0.0,0.00012,0,852468.0,852468.0,0.0,0
3,Andaman and Nicobar Islands,2009,March,Aircel,852468,3G,7091626007,2179738.0,0.0,0.00012,0,852468.0,852468.0,0.0,0
4,Andaman and Nicobar Islands,2009,May,Aircel,852468,3G,7091626007,2179738.0,0.0,0.00012,0,852468.0,852468.0,0.0,0


In [52]:
# Persist only the selected feature columns plus the `churn` target.
# Writing the whole `df` here would also export `value`, `churn_change` and
# the features that fell below the importance threshold, so the file would
# not match its "selected_features" name. The columns are taken from `df`
# (already decoded back to the original labels) rather than `df_selected`,
# which was sliced while the categorical columns still held integer codes.
df[selected_features + ['churn']].to_csv("selected_features.csv", index=False)