## `drop_columns_csv.py`
1. Read the CSV file `Output/output01-gnoc_networks-formatted.csv` into a pandas DataFrame
2. Drop the irrelevant columns from the DataFrame
3. Drop the rows where either the City or the Country name is missing
4. Write the result to `Output/output02-gnoc_networks-colsdropped.csv`

In [1]:
import pandas as pd

In [2]:
import logging
  
# Configure the root logger: messages are appended to Logs/IP2GeoPython.log
# (filemode 'a+'), each one prefixed with a timestamp.
logging.basicConfig(filename="Logs/IP2GeoPython.log", 
                    format='%(asctime)s %(message)s', 
                    filemode='a+') 

# getLogger() without a name returns the root logger configured above.
logger=logging.getLogger() 

# Set the logger threshold to INFO, so DEBUG messages are discarded.
logger.setLevel(logging.INFO) 

# Test messages, one per severity level (left disabled).
#logger.debug("debug") 
#logger.info("Info") 
#logger.warning("Warning") 
#logger.error("error") 
#logger.critical("critical") 

In [3]:
# Record in the log which input file is about to be read.
logger.info("Reading Output/output01-gnoc_networks-formatted.csv")

In [4]:
# Load the formatted networks file; it is semicolon-delimited and UTF-8 encoded.
df = pd.read_csv(
    'Output/output01-gnoc_networks-formatted.csv',
    sep=';',
    encoding='utf8',
)

In [5]:
# Preview the first rows of the freshly loaded data.
df.head()

Unnamed: 0,network,City,Country,ATON_Building_ID,CRES_Building_ID,Building,File_Name,Location,Region,Site,Support_Group,comments
0,10.100.131.0/27,Mumbai,India,394,IN041,Gigaplex,IN-NAV-AIROLIGIGAPLEX,AIROLI OTHERS,APAC,Mumbai,Group IT,ITICS VLAN-VLAN 5
1,10.100.131.160/27,Amsterdam,Netherlands,Datacenter,IN041,Datacenter,IN-NAV-AIROLIGIGAPLEX,DCA,Northern Europe,Mumbai,Group IT,
2,10.100.132.0/22,Mumbai,India,394,IN041,Gigaplex,IN-NAV-AIROLIGIGAPLEX,AIROLI OTHERS,APAC,Mumbai,Group IT,IPT 7th Floor VLAN-VLAN 10
3,10.100.136.0/23,Mumbai,India,394,IN041,Gigaplex,IN-NAV-AIROLIGIGAPLEX,AIROLI OTHERS,APAC,Mumbai,Group IT,IPT 8th Floor VLAN-VLAN 80
4,10.100.138.0/23,Mumbai,India,394,IN041,Gigaplex,IN-NAV-AIROLIGIGAPLEX,AIROLI OTHERS,APAC,Mumbai,Group IT,IPT 6th Floor VLAN-VLAN 90


In [6]:
# List the column names to decide which ones to drop.
list(df.columns)

['network',
 'City',
 'Country',
 'ATON_Building_ID',
 'CRES_Building_ID',
 'Building',
 'File_Name',
 'Location',
 'Region',
 'Site',
 'Support_Group',
 'comments']

In [7]:
# Alternative column list (the name suggests a PowerShell-produced layout --
# TODO confirm). It is not passed to drop() below.
del_list_powershell = [
    'Site',
    'Region',
    'Location',
    'Building',
    'File_Name',
    'Subnet type',
    'Support_Group',
    'Subnet Category',
    'IP Address Type',
    'ATON_Building_ID',
    'CRES_Building_ID',
    'comments ',
]

# Columns that are removed, leaving only network, City and Country.
del_list = [
    'ATON_Building_ID',
    'CRES_Building_ID',
    'Building',
    'File_Name',
    'Location',
    'Region',
    'Site',
    'Support_Group',
    'comments',
]

# Drop the columns in place; raises KeyError if any listed column is absent.
df.drop(columns=del_list, inplace=True)



In [8]:
# Preview the data after the column drop.
df.head()

Unnamed: 0,network,City,Country
0,10.100.131.0/27,Mumbai,India
1,10.100.131.160/27,Amsterdam,Netherlands
2,10.100.132.0/22,Mumbai,India
3,10.100.136.0/23,Mumbai,India
4,10.100.138.0/23,Mumbai,India


In [9]:
# Relabel the remaining columns positionally (this capitalises 'network').
# The assignment relies on the frame holding exactly three columns in the
# order network, City, Country; a different column count raises ValueError.
df.columns = ['Network', 'City', 'Country']
df.head()

Unnamed: 0,Network,City,Country
0,10.100.131.0/27,Mumbai,India
1,10.100.131.160/27,Amsterdam,Netherlands
2,10.100.132.0/22,Mumbai,India
3,10.100.136.0/23,Mumbai,India
4,10.100.138.0/23,Mumbai,India


In [10]:
# Rows with a missing City or a missing Country (or both); these are the
# rows exported separately as "dropped".
df_dropped = df[df[['City', 'Country']].isna().any(axis=1)]
df_dropped.head()

Unnamed: 0,Network,City,Country
37,10.48.147.96/27,,
42,10.48.177.16/28,,
43,10.48.182.128/26,,
44,10.48.190.0/27,,
45,10.48.64.0/28,,


In [11]:
# Save the rows with a missing City/Country for later inspection.
# NOTE(review): this file is written with the default ',' separator, while the
# input and the main output of this notebook use ';' -- confirm this is intended.
df_dropped.to_csv("Output/dropped01-gnoc_networks-nullcitycountry.csv", index=False)

In [12]:
def remove_city_with_delim(x):
    """Return only the part of a city value that precedes the first comma.

    The value is inspected through its string form. When that string
    contains a comma, the text before the first comma is returned with
    surrounding whitespace stripped. Otherwise the original object is
    returned untouched (so NaN/None and comma-free strings pass through
    unchanged, including any whitespace they carry).
    """
    head, comma, _ = str(x).partition(',')
    if not comma:
        # No delimiter present: hand back the original object as-is.
        return x
    return head.strip()


# Clean every City value: keep only the text before the first comma.
df['City'] = df['City'].apply(remove_city_with_delim)

In [13]:
# Preview the data after the City clean-up.
df.head()

Unnamed: 0,Network,City,Country
0,10.100.131.0/27,Mumbai,India
1,10.100.131.160/27,Amsterdam,Netherlands
2,10.100.132.0/22,Mumbai,India
3,10.100.136.0/23,Mumbai,India
4,10.100.138.0/23,Mumbai,India


In [14]:
# Count the rows with a missing Country and with a missing City, and log both.
null_country_entry = int(df['Country'].isna().sum())
logger.warning("Found %s entries with NULL COUNTRY NAMES", null_country_entry)

null_city_entry = int(df['City'].isna().sum())
logger.warning("Found %s entries with NULL CITY NAMES", null_city_entry)

# Rows where City OR Country (or both) is missing. This is the same condition
# used to build the "dropped" export, so both files describe the same rows.
# (The previous thresh=1 only matched rows where BOTH values were missing.)
null_country_and_city_entry = int(df[['City', 'Country']].isna().any(axis=1).sum())
logger.warning(
    "Found %s entries with either NULL CITY & COUNTRY NAMES",
    null_country_and_city_entry,
)

logger.warning("Dropping rows with either NULL CITY & COUNTRY NAMES")

# how='any': a row is removed as soon as one of the two columns is null.
df.dropna(subset=['City', 'Country'], how='any', inplace=True)

# df has already been reduced in place, so its length IS the number of
# remaining rows; subtracting the dropped count again would double-count.
logger.warning("Number of remaining valid rows in dataframe : %s", len(df))

# Write the cleaned frame with the same ';' delimiter the input file uses.
df.to_csv("Output/output02-gnoc_networks-colsdropped.csv", sep=';', index=False)

logger.warning("CSV generated: Output/output02-gnoc_networks-colsdropped.csv ")

