In [1]:
import pandas as pd
import matplotlib.pyplot as plt 

df = pd.read_csv('telecom_cleaned.csv')

In [2]:
df.columns

Index(['circle', 'type_of_connection', 'year', 'month', 'service_provider',
       'value', 'technology'],
      dtype='object')

In [3]:

#  Aggregate features — yearly sum for each technology :

#  Groups the data by both technology and year.
# Calculates the total subscriber value for each technology-year combination.

df['tech_yearly_sum'] = df.groupby(['technology', 'year'])['value'].transform('sum')

#  Average usage per technology
df['tech_avg'] = df.groupby('technology')['value'].transform('mean')

#  Difference feature — change in value from previous year
df = df.sort_values(by=['technology', 'year'])
df['tech_value_diff'] = df.groupby('technology')['value'].diff()

#  Ratio feature — share of technology usage in yearly total
df['tech_yearly_share'] = df['value'] / df['tech_yearly_sum']
#  Missing value flag :
# Checks if technology is missing (NaN).
# Converts True → 1 and False → 0

df['technology_missing'] = df['technology'].isna().astype(int)

# Check results
print(df.head())


                        circle type_of_connection  year  month  \
0  Andaman and Nicobar Islands           wireless  2009  April   
1  Andaman and Nicobar Islands           wireless  2009  April   
2  Andaman and Nicobar Islands           wireless  2009  April   
3  Andaman and Nicobar Islands           wireless  2009  April   
4  Andaman and Nicobar Islands           wireless  2009  April   

  service_provider   value technology  tech_yearly_sum      tech_avg  \
0           Aircel  852468         3G       7091626007  2.179738e+06   
1             BSNL   83332         3G       7091626007  2.179738e+06   
2           Airtel  852468         3G       7091626007  2.179738e+06   
3             HFCL  852468         3G       7091626007  2.179738e+06   
4    Vodafone-Idea  852468         3G       7091626007  2.179738e+06   

   tech_value_diff  tech_yearly_share  technology_missing  
0              NaN           0.000120                   0  
1        -769136.0           0.000012             

In [4]:
df['tech_value_diff'] = df['tech_value_diff'].fillna(0)
df['tech_value_diff'].head()

0         0.0
1   -769136.0
2    769136.0
3         0.0
4         0.0
Name: tech_value_diff, dtype: float64

In [5]:
df.head()

Unnamed: 0,circle,type_of_connection,year,month,service_provider,value,technology,tech_yearly_sum,tech_avg,tech_value_diff,tech_yearly_share,technology_missing
0,Andaman and Nicobar Islands,wireless,2009,April,Aircel,852468,3G,7091626007,2179738.0,0.0,0.00012,0
1,Andaman and Nicobar Islands,wireless,2009,April,BSNL,83332,3G,7091626007,2179738.0,-769136.0,1.2e-05,0
2,Andaman and Nicobar Islands,wireless,2009,April,Airtel,852468,3G,7091626007,2179738.0,769136.0,0.00012,0
3,Andaman and Nicobar Islands,wireless,2009,April,HFCL,852468,3G,7091626007,2179738.0,0.0,0.00012,0
4,Andaman and Nicobar Islands,wireless,2009,April,Vodafone-Idea,852468,3G,7091626007,2179738.0,0.0,0.00012,0


In [6]:
df.tail()

Unnamed: 0,circle,type_of_connection,year,month,service_provider,value,technology,tech_yearly_sum,tech_avg,tech_value_diff,tech_yearly_share,technology_missing
65893,West Bengal,wireline,2025,January,Jio,279380,5G,4347862126,9005452.0,196084.0,6.425687e-05,0
65896,West Bengal,wireline,2025,January,Vodafone-Idea,150,5G,4347862126,9005452.0,-279230.0,3.449971e-08,0
65899,West Bengal,wireline,2025,March,Airtel,86815,5G,4347862126,9005452.0,86665.0,1.996728e-05,0
65903,West Bengal,wireline,2025,March,Jio,290807,5G,4347862126,9005452.0,203992.0,6.688506e-05,0
65906,West Bengal,wireline,2025,March,Vodafone-Idea,150,5G,4347862126,9005452.0,-290657.0,3.449971e-08,0


Lag Features

In [7]:


# Sort data so lags are correct
df = df.sort_values(by=['technology', 'circle', 'service_provider', 'year', 'month'])

# Lag-1: previous month’s value
df['value_lag1'] = df.groupby(['technology', 'circle', 'service_provider'])['value'].shift(1)

# Lag-2: two months before
df['value_lag2'] = df.groupby(['technology', 'circle', 'service_provider'])['value'].shift(2)

# Fill missing lag values (first months) with 0
df['value_lag1'] = df['value_lag1'].fillna(0)
df['value_lag2'] = df['value_lag2'].fillna(0)

print(df.head(10))


                         circle type_of_connection  year     month  \
0   Andaman and Nicobar Islands           wireless  2009     April   
12  Andaman and Nicobar Islands           wireless  2009  February   
24  Andaman and Nicobar Islands           wireless  2009   January   
36  Andaman and Nicobar Islands           wireless  2009     March   
48  Andaman and Nicobar Islands           wireless  2009       May   
2   Andaman and Nicobar Islands           wireless  2009     April   
15  Andaman and Nicobar Islands           wireless  2009  February   
27  Andaman and Nicobar Islands           wireless  2009   January   
39  Andaman and Nicobar Islands           wireless  2009     March   
50  Andaman and Nicobar Islands           wireless  2009       May   

   service_provider   value technology  tech_yearly_sum      tech_avg  \
0            Aircel  852468         3G       7091626007  2.179738e+06   
12           Aircel  852468         3G       7091626007  2.179738e+06   
24        

In [8]:
df.columns 

Index(['circle', 'type_of_connection', 'year', 'month', 'service_provider',
       'value', 'technology', 'tech_yearly_sum', 'tech_avg', 'tech_value_diff',
       'tech_yearly_share', 'technology_missing', 'value_lag1', 'value_lag2'],
      dtype='object')

In [9]:
df.columns

Index(['circle', 'type_of_connection', 'year', 'month', 'service_provider',
       'value', 'technology', 'tech_yearly_sum', 'tech_avg', 'tech_value_diff',
       'tech_yearly_share', 'technology_missing', 'value_lag1', 'value_lag2'],
      dtype='object')

In [10]:
df.to_csv("telecom_feature_engineered.csv")