In [165]:
from dotenv import load_dotenv
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import sys
sys.path.append("../scripts/")  # Add the scripts folder to the path


In [166]:
from load_data import load_data_from_postgres, load_data_using_sqlalchemy
from sql_queries import execute_telecom_queries


In [167]:
# Load environment variables from .env file
load_dotenv()

# Fetch database connection parameters from environment variables
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")

In [168]:
# Define your SQL query
query = "SELECT * FROM xdr_data;"

# Load data from PostgreSQL using SQLAlchemy
df = load_data_using_sqlalchemy(query)

# Display the first few rows of the dataframe
if df is not None:
    print("Successfully loaded the data")
else:
    print("Failed to load data.")


Successfully loaded the data


In [169]:
# Display the first few rows
df.head()

Unnamed: 0,Bearer Id,Start,Start ms,End,End ms,Dur. (ms),IMSI,MSISDN/Number,IMEI,Last Location Name,...,Youtube DL (Bytes),Youtube UL (Bytes),Netflix DL (Bytes),Netflix UL (Bytes),Gaming DL (Bytes),Gaming UL (Bytes),Other DL (Bytes),Other UL (Bytes),Total UL (Bytes),Total DL (Bytes)
0,1.311448e+19,4/4/2019 12:01,770.0,4/25/2019 14:35,662.0,1823652.0,208201400000000.0,33664960000.0,35521210000000.0,9.16456699548519E+015,...,15854611.0,2501332.0,8198936.0,9656251.0,278082303.0,14344150.0,171744450.0,8814393.0,36749741.0,308879636.0
1,1.311448e+19,4/9/2019 13:04,235.0,4/25/2019 8:15,606.0,1365104.0,208201900000000.0,33681850000.0,35794010000000.0,L77566A,...,20247395.0,19111729.0,18338413.0,17227132.0,608750074.0,1170709.0,526904238.0,15055145.0,53800391.0,653384965.0
2,1.311448e+19,4/9/2019 17:42,1.0,4/25/2019 11:58,652.0,1361762.0,208200300000000.0,33760630000.0,35281510000000.0,D42335A,...,19725661.0,14699576.0,17587794.0,6163408.0,229584621.0,395630.0,410692588.0,4215763.0,27883638.0,279807335.0
3,1.311448e+19,4/10/2019 0:31,486.0,4/25/2019 7:36,171.0,1321509.0,208201400000000.0,33750340000.0,35356610000000.0,T21824A,...,21388122.0,15146643.0,13994646.0,1097942.0,799538153.0,10849722.0,749039933.0,12797283.0,43324218.0,846028530.0
4,1.311448e+19,4/12/2019 20:10,565.0,4/25/2019 10:40,954.0,1089009.0,208201400000000.0,33699800000.0,35407010000000.0,D88865A,...,15259380.0,18962873.0,17124581.0,415218.0,527707248.0,3529801.0,550709500.0,13910322.0,38542814.0,569138589.0


In [170]:
df.size

8250055

In [171]:
df.shape

(150001, 55)

In [172]:
df.dtypes

Bearer Id                                   float64
Start                                        object
Start ms                                    float64
End                                          object
End ms                                      float64
Dur. (ms)                                   float64
IMSI                                        float64
MSISDN/Number                               float64
IMEI                                        float64
Last Location Name                           object
Avg RTT DL (ms)                             float64
Avg RTT UL (ms)                             float64
Avg Bearer TP DL (kbps)                     float64
Avg Bearer TP UL (kbps)                     float64
TCP DL Retrans. Vol (Bytes)                 float64
TCP UL Retrans. Vol (Bytes)                 float64
DL TP < 50 Kbps (%)                         float64
50 Kbps < DL TP < 250 Kbps (%)              float64
250 Kbps < DL TP < 1 Mbps (%)               float64
DL TP > 1 Mb

In [173]:
# List of columns to analyze
columns = ['Last Location Name', 'Handset Manufacturer', 'Handset Type']

# Calculate the number of unique values for each column
unique_values = df[columns].nunique()

# Display the result
print(unique_values)


Last Location Name      45547
Handset Manufacturer      170
Handset Type             1396
dtype: int64


In [174]:
# Check for differences within a small tolerance
tolerance = 1e-6
differences = df[abs(df['Dur. (ms).1'] - df['Dur. (ms)']) > tolerance]

print(f"Number of differing rows within tolerance: {len(differences)}")



Number of differing rows within tolerance: 150000


In [175]:
print(df.columns)



Index(['Bearer Id', 'Start', 'Start ms', 'End', 'End ms', 'Dur. (ms)', 'IMSI',
       'MSISDN/Number', 'IMEI', 'Last Location Name', 'Avg RTT DL (ms)',
       'Avg RTT UL (ms)', 'Avg Bearer TP DL (kbps)', 'Avg Bearer TP UL (kbps)',
       'TCP DL Retrans. Vol (Bytes)', 'TCP UL Retrans. Vol (Bytes)',
       'DL TP < 50 Kbps (%)', '50 Kbps < DL TP < 250 Kbps (%)',
       '250 Kbps < DL TP < 1 Mbps (%)', 'DL TP > 1 Mbps (%)',
       'UL TP < 10 Kbps (%)', '10 Kbps < UL TP < 50 Kbps (%)',
       '50 Kbps < UL TP < 300 Kbps (%)', 'UL TP > 300 Kbps (%)',
       'HTTP DL (Bytes)', 'HTTP UL (Bytes)', 'Activity Duration DL (ms)',
       'Activity Duration UL (ms)', 'Dur. (ms).1', 'Handset Manufacturer',
       'Handset Type', 'Nb of sec with 125000B < Vol DL',
       'Nb of sec with 1250B < Vol UL < 6250B',
       'Nb of sec with 31250B < Vol DL < 125000B',
       'Nb of sec with 37500B < Vol UL',
       'Nb of sec with 6250B < Vol DL < 31250B',
       'Nb of sec with 6250B < Vol UL < 37500B',


In [176]:
scaling_factor = df['Dur. (ms).1'] / df['Dur. (ms)']
print(scaling_factor.unique())


[1000.00048913 1000.00027177 1000.00047806 ... 1000.00536392 1000.00644469
           nan]


In [177]:
# Drop Dur. (ms) and keep Dur. (ms).1
df.drop(columns=['Dur. (ms)'], inplace=True)  

In [178]:
print(df.columns)

Index(['Bearer Id', 'Start', 'Start ms', 'End', 'End ms', 'IMSI',
       'MSISDN/Number', 'IMEI', 'Last Location Name', 'Avg RTT DL (ms)',
       'Avg RTT UL (ms)', 'Avg Bearer TP DL (kbps)', 'Avg Bearer TP UL (kbps)',
       'TCP DL Retrans. Vol (Bytes)', 'TCP UL Retrans. Vol (Bytes)',
       'DL TP < 50 Kbps (%)', '50 Kbps < DL TP < 250 Kbps (%)',
       '250 Kbps < DL TP < 1 Mbps (%)', 'DL TP > 1 Mbps (%)',
       'UL TP < 10 Kbps (%)', '10 Kbps < UL TP < 50 Kbps (%)',
       '50 Kbps < UL TP < 300 Kbps (%)', 'UL TP > 300 Kbps (%)',
       'HTTP DL (Bytes)', 'HTTP UL (Bytes)', 'Activity Duration DL (ms)',
       'Activity Duration UL (ms)', 'Dur. (ms).1', 'Handset Manufacturer',
       'Handset Type', 'Nb of sec with 125000B < Vol DL',
       'Nb of sec with 1250B < Vol UL < 6250B',
       'Nb of sec with 31250B < Vol DL < 125000B',
       'Nb of sec with 37500B < Vol UL',
       'Nb of sec with 6250B < Vol DL < 31250B',
       'Nb of sec with 6250B < Vol UL < 37500B',
       'Nb of

In [179]:
#Rename the Remaining Column
df.rename(columns={'Dur. (ms).1': 'Dur. (ms)'}, inplace=True)
print(df.columns)


Index(['Bearer Id', 'Start', 'Start ms', 'End', 'End ms', 'IMSI',
       'MSISDN/Number', 'IMEI', 'Last Location Name', 'Avg RTT DL (ms)',
       'Avg RTT UL (ms)', 'Avg Bearer TP DL (kbps)', 'Avg Bearer TP UL (kbps)',
       'TCP DL Retrans. Vol (Bytes)', 'TCP UL Retrans. Vol (Bytes)',
       'DL TP < 50 Kbps (%)', '50 Kbps < DL TP < 250 Kbps (%)',
       '250 Kbps < DL TP < 1 Mbps (%)', 'DL TP > 1 Mbps (%)',
       'UL TP < 10 Kbps (%)', '10 Kbps < UL TP < 50 Kbps (%)',
       '50 Kbps < UL TP < 300 Kbps (%)', 'UL TP > 300 Kbps (%)',
       'HTTP DL (Bytes)', 'HTTP UL (Bytes)', 'Activity Duration DL (ms)',
       'Activity Duration UL (ms)', 'Dur. (ms)', 'Handset Manufacturer',
       'Handset Type', 'Nb of sec with 125000B < Vol DL',
       'Nb of sec with 1250B < Vol UL < 6250B',
       'Nb of sec with 31250B < Vol DL < 125000B',
       'Nb of sec with 37500B < Vol UL',
       'Nb of sec with 6250B < Vol DL < 31250B',
       'Nb of sec with 6250B < Vol UL < 37500B',
       'Nb of s

In [180]:
# Convert ID-like fields to strings or objects
id_columns = ['Bearer Id', 'IMSI', 'MSISDN/Number', 'IMEI']
df[id_columns] = df[id_columns].astype('object')

# Convert timestamp fields to datetime
timestamp_columns = ['Start', 'End']
df[timestamp_columns] = df[timestamp_columns].apply(pd.to_datetime)


# Convert text-based columns to category if appropriate
text_columns = ['Handset Manufacturer', 'Handset Type']
df[text_columns] = df[text_columns].astype('category')



In [181]:
df.dtypes

Bearer Id                                           object
Start                                       datetime64[ns]
Start ms                                           float64
End                                         datetime64[ns]
End ms                                             float64
IMSI                                                object
MSISDN/Number                                       object
IMEI                                                object
Last Location Name                                  object
Avg RTT DL (ms)                                    float64
Avg RTT UL (ms)                                    float64
Avg Bearer TP DL (kbps)                            float64
Avg Bearer TP UL (kbps)                            float64
TCP DL Retrans. Vol (Bytes)                        float64
TCP UL Retrans. Vol (Bytes)                        float64
DL TP < 50 Kbps (%)                                float64
50 Kbps < DL TP < 250 Kbps (%)                     float

In [182]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150001 entries, 0 to 150000
Data columns (total 54 columns):
 #   Column                                    Non-Null Count   Dtype         
---  ------                                    --------------   -----         
 0   Bearer Id                                 149010 non-null  object        
 1   Start                                     150000 non-null  datetime64[ns]
 2   Start ms                                  150000 non-null  float64       
 3   End                                       150000 non-null  datetime64[ns]
 4   End ms                                    150000 non-null  float64       
 5   IMSI                                      149431 non-null  object        
 6   MSISDN/Number                             148935 non-null  object        
 7   IMEI                                      149429 non-null  object        
 8   Last Location Name                        148848 non-null  object        
 9   Avg RTT DL (ms)

In [183]:
df.describe()

Unnamed: 0,Start,Start ms,End,End ms,Avg RTT DL (ms),Avg RTT UL (ms),Avg Bearer TP DL (kbps),Avg Bearer TP UL (kbps),TCP DL Retrans. Vol (Bytes),TCP UL Retrans. Vol (Bytes),...,Youtube DL (Bytes),Youtube UL (Bytes),Netflix DL (Bytes),Netflix UL (Bytes),Gaming DL (Bytes),Gaming UL (Bytes),Other DL (Bytes),Other UL (Bytes),Total UL (Bytes),Total DL (Bytes)
count,150000,150000.0,150000,150000.0,122172.0,122189.0,150000.0,150000.0,61855.0,53352.0,...,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,150000.0,150000.0
mean,2019-04-26 12:36:09.616800256,499.1882,2019-04-27 17:39:38.703200256,498.80088,109.795706,17.662883,13300.045927,1770.428647,20809910.0,759658.7,...,11634070.0,11009410.0,11626850.0,11001750.0,422044700.0,8288398.0,421100500.0,8264799.0,41121210.0,454643400.0
min,2019-04-04 12:01:00,0.0,2019-04-24 22:59:00,0.0,0.0,0.0,0.0,0.0,2.0,1.0,...,53.0,105.0,42.0,35.0,2516.0,59.0,3290.0,148.0,2866892.0,7114041.0
25%,2019-04-25 00:26:00,250.0,2019-04-26 07:30:00,251.0,32.0,2.0,43.0,47.0,35651.5,4694.75,...,5833501.0,5517965.0,5777156.0,5475981.0,210473300.0,4128476.0,210186900.0,4145943.0,33222010.0,243106800.0
50%,2019-04-26 08:51:00,499.0,2019-04-27 23:02:00,500.0,45.0,5.0,63.0,63.0,568730.0,20949.5,...,11616020.0,11013450.0,11642220.0,10996380.0,423408100.0,8291208.0,421803000.0,8267071.0,41143310.0,455841100.0
75%,2019-04-27 18:18:15,749.0,2019-04-29 07:15:00,750.0,70.0,15.0,19710.75,1120.0,3768308.0,84020.25,...,17448520.0,16515560.0,17470480.0,16507270.0,633174200.0,12431620.0,631691800.0,12384150.0,49034240.0,665705500.0
max,2019-04-29 07:28:00,999.0,2019-04-30 23:59:00,999.0,96923.0,7120.0,378160.0,58613.0,4294426000.0,2908226000.0,...,23259100.0,22011960.0,23259190.0,22011960.0,843441900.0,16558790.0,843442500.0,16558820.0,78331310.0,902969600.0
std,,288.611834,,288.097653,619.782739,84.793524,23971.878541,4625.3555,182566500.0,26453050.0,...,6710569.0,6345423.0,6725218.0,6359490.0,243967500.0,4782700.0,243205000.0,4769004.0,11276390.0,244142900.0


In [184]:
df[['Handset Manufacturer', 'Handset Type']].describe()

Unnamed: 0,Handset Manufacturer,Handset Type
count,149429,149429
unique,170,1396
top,Apple,Huawei B528S-23A
freq,59565,19752


In [185]:
#Dropping Columns with Single Value
data_single=df.copy()
single_value_columns = pd.DataFrame(data_single.apply(lambda x: len(x.value_counts()), axis=0), columns=['SingleValueColumn'])
drop_single_value_columns = list(single_value_columns.loc[single_value_columns['SingleValueColumn']==1].index)
print('Columns which have just a single value => \n\n' + str(drop_single_value_columns))
df.drop(drop_single_value_columns, axis=1, inplace=True)
print('\n\nRemaining Columns => ' + str(len(df.columns))) 

Columns which have just a single value => 

[]


Remaining Columns => 54


In [186]:
#dropping Columns with All Different Values
data_diff=df.copy()
alldiff_value_columns = pd.DataFrame(data_diff.apply(lambda x: len(x.value_counts()), axis=0), columns=['AllDiffValueColumn'])
drop_alldiff_value_columns = list(alldiff_value_columns.loc[alldiff_value_columns['AllDiffValueColumn']==len(df.index)].index)
print('Columns which have all values different => \n\n' + str(drop_alldiff_value_columns))
df.drop(drop_alldiff_value_columns, axis=1, inplace=True)
print('\n\nRemaining Columns => ' + str(len(df.columns)))

Columns which have all values different => 

[]


Remaining Columns => 54


In [187]:
# Check for missing values
df.isnull().sum()

Bearer Id                                      991
Start                                            1
Start ms                                         1
End                                              1
End ms                                           1
IMSI                                           570
MSISDN/Number                                 1066
IMEI                                           572
Last Location Name                            1153
Avg RTT DL (ms)                              27829
Avg RTT UL (ms)                              27812
Avg Bearer TP DL (kbps)                          1
Avg Bearer TP UL (kbps)                          1
TCP DL Retrans. Vol (Bytes)                  88146
TCP UL Retrans. Vol (Bytes)                  96649
DL TP < 50 Kbps (%)                            754
50 Kbps < DL TP < 250 Kbps (%)                 754
250 Kbps < DL TP < 1 Mbps (%)                  754
DL TP > 1 Mbps (%)                             754
UL TP < 10 Kbps (%)            

In [188]:
#Calculates the percentage of missing values in each column and sorts them in descending order
round(100*(df.isnull().sum(axis=0)/len(df.index)), 2).sort_values(ascending=False)



Nb of sec with 37500B < Vol UL              86.84
Nb of sec with 6250B < Vol UL < 37500B      74.56
Nb of sec with 125000B < Vol DL             65.02
TCP UL Retrans. Vol (Bytes)                 64.43
Nb of sec with 31250B < Vol DL < 125000B    62.39
Nb of sec with 1250B < Vol UL < 6250B       61.93
Nb of sec with 6250B < Vol DL < 31250B      58.88
TCP DL Retrans. Vol (Bytes)                 58.76
HTTP UL (Bytes)                             54.54
HTTP DL (Bytes)                             54.32
Avg RTT DL (ms)                             18.55
Avg RTT UL (ms)                             18.54
Last Location Name                           0.77
MSISDN/Number                                0.71
Bearer Id                                    0.66
UL TP < 10 Kbps (%)                          0.53
50 Kbps < UL TP < 300 Kbps (%)               0.53
10 Kbps < UL TP < 50 Kbps (%)                0.53
UL TP > 300 Kbps (%)                         0.53
Nb of sec with Vol UL < 1250B                0.53


In [189]:
# Count of duplicates
df.duplicated().value_counts()



False    150001
Name: count, dtype: int64

In [190]:
#Counts the number of missing values in each row
df.isnull().sum(axis=1)



0         10
1         10
2         12
3         12
4         12
          ..
149996    10
149997    10
149998    10
149999    10
150000    40
Length: 150001, dtype: int64

In [191]:
#Calculates the percentage of missing values in each row and sorts them in descending order.
round(100*(df.isnull().sum(axis=1)/len(df.index)), 2).sort_values(ascending=False)



150000    0.03
23552     0.02
13036     0.02
31166     0.02
34183     0.02
          ... 
84648     0.00
84645     0.00
84641     0.00
84670     0.00
84669     0.00
Length: 150001, dtype: float64

In [192]:
#Filter rows with more than 17 missing values
df[df.isnull().sum(axis=1) >17 ]


Unnamed: 0,Bearer Id,Start,Start ms,End,End ms,IMSI,MSISDN/Number,IMEI,Last Location Name,Avg RTT DL (ms),...,Youtube DL (Bytes),Youtube UL (Bytes),Netflix DL (Bytes),Netflix UL (Bytes),Gaming DL (Bytes),Gaming UL (Bytes),Other DL (Bytes),Other UL (Bytes),Total UL (Bytes),Total DL (Bytes)
4207,,2019-04-24 14:50:00,274.0,2019-04-25 00:21:00,514.0,,,,,,...,1.916335e+07,1.676094e+07,1.589242e+06,1.720474e+07,1.156853e+07,6.152100e+05,4.771721e+08,1.324574e+07,52556607.0,45044979.0
5457,,2019-04-24 10:06:00,584.0,2019-04-25 00:11:00,358.0,,,,,,...,1.525902e+07,2.059878e+07,3.790427e+06,1.912646e+07,6.688993e+08,1.499422e+07,2.797914e+08,7.600203e+06,63878179.0,696948411.0
6085,7277825654249889792.0,2019-04-24 11:18:00,538.0,2019-04-25 00:08:00,339.0,208201904493131.0,33628784076.0,35725208725710.0,,,...,1.832995e+07,9.006635e+06,1.265831e+07,6.917377e+06,3.174031e+08,1.084563e+07,2.215847e+08,5.030524e+06,34092792.0,358829471.0
6985,,2019-04-24 13:02:00,585.0,2019-04-25 04:08:00,29.0,,,,,,...,1.536046e+07,1.525185e+07,2.237790e+07,7.622937e+06,4.227835e+08,6.744825e+06,5.208940e+07,8.204020e+06,40030355.0,470533304.0
6993,7349883248194010112.0,2019-04-24 13:02:00,893.0,2019-04-24 23:52:00,111.0,208201715253178.0,33660389345.0,86778702545699.0,,,...,2.650696e+06,2.246060e+05,2.785876e+06,2.034908e+07,2.917886e+08,1.731621e+06,4.514296e+08,3.408783e+06,25987241.0,302086505.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148810,,2019-04-29 07:14:00,456.0,2019-04-30 06:23:00,492.0,,,,,,...,1.560789e+07,1.565180e+07,7.191123e+06,9.125542e+06,5.150658e+08,1.422306e+07,6.023466e+08,1.605611e+06,45283727.0,549026287.0
149059,,2019-04-29 07:18:00,358.0,2019-04-30 00:11:00,376.0,,,,,,...,6.873677e+06,1.427194e+07,1.365214e+07,1.852828e+07,6.063120e+08,1.483046e+07,5.459522e+08,1.882888e+06,54158393.0,634174785.0
149090,7349883264100840448.0,2019-04-29 07:18:00,498.0,2019-04-30 00:14:00,900.0,208202101297302.0,,35962008247199.0,T14478A,,...,1.472172e+07,1.180267e+07,1.427289e+07,1.694177e+07,2.491397e+07,1.422218e+07,1.469236e+08,6.590809e+06,52857876.0,61992006.0
149206,,2019-04-29 07:20:00,182.0,2019-04-30 00:24:00,834.0,,,,,,...,2.211622e+07,1.838540e+07,1.015965e+07,4.973486e+06,4.375109e+08,1.588545e+07,2.091552e+08,8.330388e+06,51913246.0,480370571.0


In [None]:
#Drop Rows with Too Many Missing Values
threshold = 17
df = df[df.isnull().sum(axis=1) <= threshold]
df.shape



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150001 entries, 0 to 150000
Data columns (total 44 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   Bearer Id                       149010 non-null  object        
 1   Start                           150000 non-null  datetime64[ns]
 2   Start ms                        150000 non-null  float64       
 3   End                             150000 non-null  datetime64[ns]
 4   End ms                          150000 non-null  float64       
 5   IMSI                            149431 non-null  object        
 6   MSISDN/Number                   148935 non-null  object        
 7   IMEI                            149429 non-null  object        
 8   Last Location Name              148848 non-null  object        
 9   Avg RTT DL (ms)                 122172 non-null  float64       
 10  Avg RTT UL (ms)                 122189 non-null  float64

In [193]:
# Columns with > 50% missing values
high_missing_cols = [
    "Nb of sec with 37500B < Vol UL",
    "Nb of sec with 6250B < Vol UL < 37500B",
    "Nb of sec with 125000B < Vol DL",
    "TCP UL Retrans. Vol (Bytes)",
    "Nb of sec with 31250B < Vol DL < 125000B",
    "Nb of sec with 1250B < Vol UL < 6250B",
    "Nb of sec with 6250B < Vol DL < 31250B",
    "TCP DL Retrans. Vol (Bytes)",
    "HTTP UL (Bytes)",
    "HTTP DL (Bytes)"
]
# Drop unnecessary columns
df.drop(columns=high_missing_cols, inplace=True)

# Verify the remaining columns
print(df.info())










(149841, 54)

In [None]:
num_rows_with_many_missing = (df.isnull().sum(axis=1) > threshold).sum()
print(f"Rows with more than {threshold} missing values: {num_rows_with_many_missing}")


In [159]:
# Handle Missing Values
df.fillna(df.mean(numeric_only=True), inplace=True)

In [160]:
# Treating Outliers - Replace with Mean
#Outliers capped using 99th percentile and replaced with mean values
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
for col in numeric_columns:
    df[col] = np.where(df[col] > df[col].quantile(0.99), df[col].mean(), df[col])




In [161]:
# Check for missing values
df.isnull().sum()

Bearer Id                          990
Start                                0
Start ms                             0
End                                  0
End ms                               0
IMSI                               569
MSISDN/Number                     1065
IMEI                               571
Last Location Name                1152
Avg RTT DL (ms)                      0
Avg RTT UL (ms)                      0
Avg Bearer TP DL (kbps)              0
Avg Bearer TP UL (kbps)              0
DL TP < 50 Kbps (%)                  0
50 Kbps < DL TP < 250 Kbps (%)       0
250 Kbps < DL TP < 1 Mbps (%)        0
DL TP > 1 Mbps (%)                   0
UL TP < 10 Kbps (%)                  0
10 Kbps < UL TP < 50 Kbps (%)        0
50 Kbps < UL TP < 300 Kbps (%)       0
UL TP > 300 Kbps (%)                 0
Activity Duration DL (ms)            0
Activity Duration UL (ms)            0
Dur. (ms)                            0
Handset Manufacturer               571
Handset Type             

In [162]:
#Counts the number of missing values in each row
df.isnull().sum(axis=1)

0         0
1         0
2         0
3         0
4         0
         ..
149995    0
149996    0
149997    0
149998    0
149999    0
Length: 150000, dtype: int64

In [163]:
#Filter rows with more than 17 missing values
df[df.isnull().sum(axis=1) >1]

Unnamed: 0,Bearer Id,Start,Start ms,End,End ms,IMSI,MSISDN/Number,IMEI,Last Location Name,Avg RTT DL (ms),...,Youtube DL (Bytes),Youtube UL (Bytes),Netflix DL (Bytes),Netflix UL (Bytes),Gaming DL (Bytes),Gaming UL (Bytes),Other DL (Bytes),Other UL (Bytes),Total UL (Bytes),Total DL (Bytes)
254,,2019-04-24 20:08:00,170.0,2019-04-25 01:45:00,170.0,,,,,39.000000,...,8.829710e+06,1477432.0,2186589.0,18113094.0,358896480.0,1749997.0,749642316.0,14010369.0,36140598.0,376306114.0
419,,2019-04-26 08:17:00,774.0,2019-04-27 21:48:00,284.0,,,,,34.000000,...,1.441174e+07,13389932.0,4590492.0,12680887.0,563257029.0,10816837.0,624661261.0,14019168.0,52639663.0,590292556.0
466,,2019-04-23 23:03:00,337.0,2019-04-24 23:03:00,295.0,,,,,113.000000,...,1.163407e+07,3840901.0,22311882.0,14982966.0,60479153.0,15850878.0,99193737.0,8689286.0,45888409.0,111555134.0
481,,2019-04-23 23:14:00,657.0,2019-04-24 23:13:00,716.0,,,,,315.000000,...,1.977520e+07,10133324.0,2167835.0,19492173.0,132421673.0,13140026.0,614341412.0,8114983.0,54606858.0,163112548.0
506,,2019-04-23 23:26:00,183.0,2019-04-24 23:26:00,668.0,,,,,133.000000,...,1.191700e+07,2272143.0,4789329.0,14242655.0,409799140.0,12466608.0,124215915.0,6537272.0,38304204.0,433422214.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148810,,2019-04-29 07:14:00,456.0,2019-04-30 06:23:00,492.0,,,,,109.795706,...,1.560789e+07,15651799.0,7191123.0,9125542.0,515065799.0,14223064.0,602346620.0,1605611.0,45283727.0,549026287.0
149059,,2019-04-29 07:18:00,358.0,2019-04-30 00:11:00,376.0,,,,,109.795706,...,6.873677e+06,14271941.0,13652142.0,18528283.0,606312011.0,14830455.0,545952229.0,1882888.0,54158393.0,634174785.0
149206,,2019-04-29 07:20:00,182.0,2019-04-30 00:24:00,834.0,,,,,109.795706,...,2.211622e+07,18385402.0,10159653.0,4973486.0,437510932.0,15885454.0,209155230.0,8330388.0,51913246.0,480370571.0
149376,,2019-04-29 07:22:00,57.0,2019-04-29 23:12:00,744.0,,,,,119.000000,...,1.711588e+07,20211769.0,3636843.0,20116835.0,620687223.0,254490.0,810923466.0,2158942.0,45964818.0,647579017.0


In [164]:
# Filter rows where 'Start' and 'End' are NaN but 'Dur. (ms)' is not NaN
df = df[df['Start'].isnull() & df['End'].isnull() & df['Dur. (ms)'].notnull()]

