# `GEOIPS API`

In [1]:
import os, time, threading, requests
import pandas as pd
import numpy as np

pd.set_option('notebook_repr_html', True)
pd.set_option('display.mpl_style', 'default')

# Locate API key for the geoips application - www.geoips.com
f_entr = open(os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), 'geoips_enterp_key.txt'), 'r')
f_basc = open(os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), 'geoips_basic_key.txt'), 'r')

geoips_key = f_entr.read()
# geoips_key = f_basc.read()


In [5]:
process_with_geoips = pd.read_csv("../virus_malware_analytics/for_geoips.csv", index_col='IP_Address')
print("Number of rows : {:,}".format(process_with_geoips.index.size))
process_with_geoips.head(3)

Number of rows : 200,000


Unnamed: 0_level_0,Unnamed: 0,IP_Class
IP_Address,Unnamed: 1_level_1,Unnamed: 2_level_1
194.106.219.10,37500,Class C
204.188.237.10,37501,Class C
116.255.130.109,37502,Class A


In [6]:
api_daily_limit = 9500

# Divide based on number of api daily call limits
tot_api_calls = process_with_geoips.index.size / api_daily_limit
tot_files = pd.Series(pd.date_range(start=pd.datetime.today(), periods=tot_api_calls, freq='D'))

# Divide into filenames based on dates
inp_filenames = ["geoips_{0}.csv".format(tm) for tm in tot_files.apply(lambda x: x.day).values]
print(inp_filenames)

['geoips_5.csv', 'geoips_6.csv', 'geoips_7.csv', 'geoips_8.csv', 'geoips_9.csv', 'geoips_10.csv', 'geoips_11.csv', 'geoips_12.csv', 'geoips_13.csv', 'geoips_14.csv', 'geoips_15.csv', 'geoips_16.csv', 'geoips_17.csv', 'geoips_18.csv', 'geoips_19.csv', 'geoips_20.csv', 'geoips_21.csv', 'geoips_22.csv', 'geoips_23.csv', 'geoips_24.csv', 'geoips_25.csv']


## Create the `input .csv files` using dates as pointers

In [7]:

start_point = 0
end_point = api_daily_limit

for csvf in inp_filenames:
    dataset = process_with_geoips[start_point:end_point]
    dataset.to_csv(os.path.join(os.path.join(os.getcwd(), 'cron_data', 'geoips', 'input'), csvf))
    
    # Update pointers
    start_point = end_point
    end_point += api_daily_limit



## Function to extract ip geodata from `geoips.com`

In [8]:

gips_base_url = "http://api.geoips.com/ip"
append_url = "output/json/hostname/true/timezone/true"


def extract_from_geoips(df):
    """ Function to extract IP geodata from geoips.com """
    for ip in df.index.values:
        geoips_url = "{0}/{1}/key/{2}/{3}".format(gips_base_url, ip, geoips_key, append_url)
        r = requests.get(geoips_url)
        
        if r.ok:
            try:
                r.json()
            # We CANNOT parse json due to html error. Return existing data.
            except ValueError:
                api_stopped_at.append(ip)
                ip_location_data = dict(ip=ip)
                ip_location_data.update(df.ix[ip].to_dict())
                extracted_data.append(ip_location_data)
                continue

            # GET request OK, AND we can parse returned json. Get new data.
            else: 
                if 'error' not in r.json():
                    returned_data = r.json()['response']

                    # We parse json BUT no entry was found for IP
                    if '200_1' not in returned_data['code']:
                        ip_location_data = dict(ip=ip)
                        ip_location_data.update(df.ix[ip].to_dict())
                        extracted_data.append(ip_location_data)
                        continue

                    else:  # We parse json AND VALID entry found for IP
                        ip_location_data = returned_data['location']
                        ip_location_data.update(df.ix[ip].to_dict())
                        extracted_data.append(ip_location_data)
                        continue

                else: # GET request OK BUT error in parsed json. Return already known data
                    api_stopped_at.append(ip)
                    ip_location_data = dict(ip=ip)
                    ip_location_data.update(df.ix[ip].to_dict())
                    extracted_data.append(ip_location_data)
                    continue

        else: # GET request NOT OK. It returns an error. Return already known data
            api_stopped_at.append(ip)
            ip_location_data = dict(ip=ip)
            ip_location_data.update(df.ix[ip].to_dict())
            extracted_data.append(ip_location_data)
            continue 

    # After the FOR LOOP. Return the list of json as dataframe
    fresh_fetched_data = pd.DataFrame(extracted_data)
    fresh_fetched_data = fresh_fetched_data.set_index(fresh_fetched_data['Unnamed: 0'])
    fresh_fetched_data.drop('Unnamed: 0', inplace=True, axis=1)
    
    return pd.DataFrame(fresh_fetched_data)


# `DAILY CODE` to fetch `9,500` geoIPs data using API service

#### Read a .csv file, process with `extract_from_geoips(df)` and save the returned df as .csv

In [10]:
len(api_stopped_at)

In [510]:
len(extracted_data)


7947

In [512]:
pd.DataFrame(extracted_data).tail()

Unnamed: 0.1,IP_Class,Unnamed: 0,city_name,continent_code,continent_name,country_code,country_name,county_name,hostname,ip,latitude,longitude,owner,region_code,region_name,timezone
7942,Class A,111942,SANTA CLARA,,NORTH AMERICA,US,UNITED STATES,SANTA CLARA,74-82-169-73.genericreverse.com,74.82.169.73,37.3524,-121.9584,TAKE 2 HOSTING INC.,CA,CALIFORNIA,PST
7943,Class C,111943,RIOJA,EU,EUROPE,ES,SPAIN,ANDALUCIA,llgb376.servidoresdns.net,217.76.130.26,36.95,-2.45,ARSYS.ES,AL,ALMERIA,GMT+1
7944,Class A,111944,,ME,MIDDLE EAST,EG,EGYPT,,host-41.234.97.4.tedata.net,41.234.97.4,27.0,30.0,,,,GMT+2
7945,Class A,111945,RIYADH,ME,MIDDLE EAST,SA,SAUDI ARABIA,,unknown,86.51.204.51,24.64,46.77,BROADBAND IP RANGE,,,GMT+3
7946,Class C,111946,TUNIS,AF,AFRICA,TN,TUNISIA,,unknown,197.0.189.197,36.8,10.18,ORGANISATION,,,GMT+1


In [514]:
dataset.ix[dataset['Unnamed: 0'] == 111947]

Unnamed: 0_level_0,Unnamed: 0,IP_Class
IP_Address,Unnamed: 1_level_1,Unnamed: 2_level_1
88.241.98.22,111947,Class A


## Use only at first run

In [11]:
api_stopped_at = []

extracted_data = []


## Continue

In [206]:


# Log start time
start_time = pd.datetime.today()

file_tofetch = "geoips_{0}.csv".format(pd.datetime.today().day)
fetched_data = "out_geoips_{0}.csv".format(pd.datetime.today().day)

f_path = os.path.join(os.getcwd(), 'cron_data', 'geoips', 'input')
dataset = pd.read_csv(os.path.join(f_path, file_tofetch), index_col='IP_Address')



# Call function to extract geoips data on the dataframe
geoips_extract = extract_from_geoips(dataset)


# Save fetched file as .csv
fout_path = os.path.join(os.getcwd(), 'cron_data', 'geoips', 'output')
geoips_extract.to_csv(os.path.join(fout_path, fetched_data))


# Log finish time and check the difference
end_time = pd.datetime.today()
elapsed = (end_time - start_time).total_seconds()


print("\n\n %s - API data extraction completed and saved as .csv! \t *** %d of %d *** in %.2f mins \n"
      %(file_tofetch,inp_filenames.index(file_tofetch)+1, len(inp_filenames), elapsed/60))


# Print sample output
geoips_extract.head(3)

# `TEST`

In [178]:

ipx = '199.27.135.79'

gips_base_url = "http://api.geoips.com/ip"
append_url = "output/json/hostname/true/timezone/true"
geoips_url = "{0}/{1}/key/{2}/{3}".format(gips_base_url, ipx, geoips_key, append_url)

r = requests.get(geoips_url)

r.content

b'<div style="border:1px solid #990000;padding-left:20px;margin:0 0 10px 0;">\n\n<h4>A PHP Error was encountered</h4>\n\n<p>Severity: Notice</p>\n<p>Message:  Undefined index: target</p>\n<p>Filename: controllers/query.php</p>\n<p>Line Number: 154</p>\n\n</div>{\n"response": {\n"status": "Propper Request",\n"message": "Success",\n"notes": "The following results has been returned",\n"code": "200_1",\n"location": {\n"ip": "199.27.135.79",\n"hostname": "",\n"owner": "IPV4 ADDRESS BLOCK NOT MANAGED BY THE RIPE NCC",\n"continent_name": "NORTH AMERICA",\n"continent_code": "NA",\n"country_name": "UNITED STATES",\n"country_code": "US",\n"region_name": "CALIFORNIA",\n"region_code": "CA",\n"county_name": "SAN FRANCISCO SAN MATEO",\n"city_name": "SAN FRANCISCO",\n"latitude": "37.7753",\n"longitude": "-122.4186",\n"timezone": "PST"\n},\n"unit_test": {\n"api_calls": "10000",\n"current_calls": "2505",\n"elapsed_time": "0.0276",\n"memory_usage": "1.9MB"\n}\n}\n}'

# Join all the `geoips` dataset together into one dataframe

In [168]:
pieces = []
f_path = os.path.join(os.getcwd(), 'cron_data', 'geoips', 'input')

for k in np.arange(1,10):
    file_tofetch = "out_geoips_{}.csv".format(k)
    dataset = pd.read_csv(os.path.join(f_path, file_tofetch), index_col=0)
    pieces.append(dataset)

# Concatenate the list of files into single dataframe
geoips_data = pd.concat(pieces)

# Get the rows with missing update/data attributes
missing_data = geoips_data[geoips_data['country_name'].isnull()]
print( "\n\nMissing data: {} out of {:,} \n".format(missing_data.index.size, geoips_data.index.size))
missing_data.head()

# geoips_data = geoips_data.head(100)



Missing data: 975 out of 76,000 



Unnamed: 0,IP_Class,city_name,continent_code,continent_name,country_code,country_name,county_name,hostname,ip,latitude,longitude,owner,region_code,region_name,timezone
78424,Class A,,,,,,,92-61-154-16.static.servage.net,92.61.154.16,0.0,0.0,SERVAGE.NET - VPS SEGMENT 2,,,
78428,Class A,,,,,,,unknown,10.89.234.76,0.0,0.0,NOT ALLOCATED,,,
78438,Class C,,,,,,,,199.27.135.79,,,,,,
78445,Class C,,,,,,,unknown,192.168.10.11,0.0,0.0,NOT ALLOCATED,,,
78584,Class A,,,,,,,,92.53.113.85,,,,,,


# Function to update missing data `GEOIPS` API Call using another API service

In [169]:
def update_missing_data():
    for ipx in geoips_data[geoips_data['country_name'].isnull()].ip.values:
        r = requests.get("http://freegeoip.net/json/"+ipx)
        if r.ok:
            js_data = r.json()
#             geoips_data.loc[ipx,'ip'] = js_data['ip']
            geoips_data.loc[geoips_data.ip==ipx,'city_name'] = js_data['city']
            geoips_data.loc[geoips_data.ip==ipx,'country_code'] = js_data['country_code']
            geoips_data.loc[geoips_data.ip==ipx,'country_name'] = js_data['country_name']
            geoips_data.loc[geoips_data.ip==ipx,'latitude'] = js_data['latitude']
            geoips_data.loc[geoips_data.ip==ipx,'longitude'] = js_data['longitude']
            geoips_data.loc[geoips_data.ip==ipx,'region_code'] = js_data['region_code']
            geoips_data.loc[geoips_data.ip==ipx,'region_name'] = js_data['region_name']
            geoips_data.loc[geoips_data.ip==ipx,'timezone'] = js_data['time_zone']
            geoips_data.loc[geoips_data.ip==ipx,'longitude'] = js_data['longitude']
    
    return ("\n\nCompleted!\n\n")
        

    
# Call function to update `geoips_data` with `update_missing_data`
update_missing_data()

geoips_data.head(3)



Completed!


None


# Combine the separate Freegeoip.net data into single csv file

In [352]:
pieces = []
f_path = os.path.join(os.getcwd(), 'cron_data', 'output')

for k in np.arange(1,59):
    file_tofetch = "out_free_gip_{}.csv".format(k)
    data_frame = pd.read_csv(os.path.join(f_path, file_tofetch), index_col=0)
    pieces.append(data_frame)

    
# Concatenate the list of files into single dataframe
freegeoip_data = pd.concat(pieces)

# Clean up the files

In [376]:

# Rename the column names
freegeoip_data.rename(columns={'city':'city_name', 'time_zone':'timezone'}, inplace=True)

# Import existing IP dataset for IPClass attributes
uniqueIPs = pd.read_csv(os.path.join(os.getcwd(), 'csv_dataset', "uniqueIPs.csv"), index_col=0)

mgd_data = pd.merge(freegeoip_data, uniqueIPs, left_on='ip', right_on='IP_Address')
mgd_data.drop('IP_Address', axis=1, inplace=True)
mgd_data.head(1)

Unnamed: 0,city_name,country_code,country_name,ip,latitude,longitude,metro_code,region_code,region_name,timezone,zip_code,IP_Class
0,Ashburn,US,United States,174.140.154.23,39.0437,-77.4875,511,VA,Virginia,America/New_York,20146,Class B


In [377]:
geoips_data.head(1)

Unnamed: 0,IP_Class,city_name,continent_code,continent_name,country_code,country_name,county_name,hostname,ip,latitude,longitude,owner,region_code,region_name,timezone
78000,Class A,BEIJING,AS,ASIA,CN,CHINA,,unknown,111.3.42.65,39.9,116.41,CHINA MOBILE COMMUNICATIONS CORPORATION,BJ,,GMT+8


# Combine both Freegeoip.net and geoips.com API dataset into 1 csv file

In [482]:

# Combine all
augmented_ip_geodata = geoips_data.append(mgd_data, ignore_index=True)
augmented_ip_geodata.columns = augmented_ip_geodata.columns.str.upper()
change_type(augmented_ip_geodata).to_csv(os.path.join(os.getcwd(), 'csv_dataset', 'augmented_ip_geodata.csv'))

augmented_ip_geodata.head(3)

Unnamed: 0,IP_CLASS,CITY_NAME,CONTINENT_CODE,CONTINENT_NAME,COUNTRY_CODE,COUNTRY_NAME,COUNTY_NAME,HOSTNAME,IP,LATITUDE,LONGITUDE,METRO_CODE,OWNER,REGION_CODE,REGION_NAME,TIMEZONE,ZIP_CODE
0,CLASS A,BEIJING,AS,ASIA,CN,CHINA,,UNKNOWN,111.3.42.65,39.9,116.41,,CHINA MOBILE COMMUNICATIONS CORPORATION,BJ,,GMT+8,
1,CLASS A,ENID,,NORTH AMERICA,US,UNITED STATES,GARFIELD,C74-197-189-185.DH.SUDDENLINK.NET,74.197.189.185,36.3957,-97.8784,,SUDDENLINK COMMUNICATIONS,OK,OKLAHOMA,CST,
2,CLASS A,NANNING,AS,ASIA,CN,CHINA,,UNKNOWN,116.252.23.146,22.82,108.32,,CHINANET GUANGXI PROVINCE NETWORK,GX,,GMT+8,


# Extracting additional information about `Countries` from `geoips.com`

### Transform each entry to uppercase for easy comparison

In [5]:
def change_type(df):
    df.columns = df.columns.str.upper()
    for col in df.columns:
        if df[col].dtypes == 'object':
            df[col] = df[col].str.upper()
    return df


In [9]:
data_url = 'http://www.stat.ubc.ca/~jenny/notOcto/STAT545A/examples/gapminder/data/gapminderDataFiveYear.txt'
dframe = pd.read_csv(data_url, sep='\t')
# dframe.drop(['year','pop', 'lifeExp', 'gdpPercap'], axis=1, inplace=True)
# dframe.columns = ['COUNTRY_NAME', 'CONTINENT_NAME']
dframe = change_type(dframe)
dframe.head()

Unnamed: 0,COUNTRY,YEAR,POP,CONTINENT,LIFEEXP,GDPPERCAP
0,AFGHANISTAN,1952,8425333,ASIA,28.801,779.445314
1,AFGHANISTAN,1957,9240934,ASIA,30.332,820.85303
2,AFGHANISTAN,1962,10267083,ASIA,31.997,853.10071
3,AFGHANISTAN,1967,11537966,ASIA,34.02,836.197138
4,AFGHANISTAN,1972,13079460,ASIA,36.088,739.981106


In [56]:
country_list = pd.read_table("http://nbviewer.ipython.org/github/cs109/2014_data/blob/master/countries.csv", sep=',')
country_list.columns = ['Country name', 'Region']
print("\nNumber of rows ",  country_list.index.size)
change_type(country_list).head(2)



Number of rows  194


Unnamed: 0,Country name,Region
0,ALGERIA,AFRICA
1,ANGOLA,AFRICA


In [58]:
df = countries.combine_first(country_list)
df.drop('Region', inplace=True, axis=1)

country_continent = pd.merge(df, country_list, right_on='Country name', left_on='Country name', how='inner')
country_continent.columns = country_continent.columns.str.upper()

change_type(country_continent).to_csv(os.path.join(os.getcwd(),'csv_dataset','augmented_countries_profile.csv'))
print("\nNumber of rows ",  country_continent.index.size)
country_continent.tail()


Number of rows  175


Unnamed: 0,AREA (KM2),CAPITAL,CODE,COUNTRY NAME,POPULATION,REGION
170,447400,TASHKENT,UZ,UZBEKISTAN,27865738,ASIA
171,12200,PORT VILA,VU,VANUATU,221552,OCEANIA
172,527970,SANAA,YE,YEMEN,23495361,ASIA
173,752614,LUSAKA,ZM,ZAMBIA,13460305,AFRICA
174,390580,HARARE,ZW,ZIMBABWE,11651858,AFRICA
