The function below (factor_location_data) takes in the dataframe produced after the regression has been performed, and will
scale or filter the output variable according to the user's selected location preference


Mode Options:
- 1: Location Does Not Matter (return the same df back)
- 2: Location DOES Matter, Given an n-mile radius around the user's location (entered as City, State), drop all schools outside of that radius
- 3: Location DOES Matter, Use a logistic weighting fn (clamped to [0.3, 0.7]) to weight scores based on each school's distance from the user's location, normalized by the largest distance
- MAYBE -> 4: Location DOES Matter, Let the user choose which of the 50 States (+ DC & PR) are allowable; schools in all other states are dropped

In [22]:
import pandas as pd
import missingno as msno
import numpy as np
import math



In [23]:
# Default location of the per-college coordinates / state lookup table.
_GEO_DATA_PATH = '../data/Interim/College_GeoData.csv'

# Two-letter codes accepted in mode 4: the 50 states plus DC and PR.
_VALID_STATE_CODES = frozenset([
    'AL', 'AK', 'AZ', 'AR', 'CA',
    'CO', 'CT', 'DE', 'FL', 'GA',
    'HI', 'ID', 'IL', 'IN', 'IA',
    'KS', 'KY', 'LA', 'ME', 'MD',
    'MA', 'MI', 'MN', 'MS', 'MO',
    'MT', 'NE', 'NV', 'NH', 'NJ',
    'NM', 'NY', 'NC', 'ND', 'OH',
    'OK', 'OR', 'PA', 'RI', 'SC',
    'SD', 'TN', 'TX', 'UT', 'VT',
    'VA', 'WA', 'WV', 'WI', 'WY',
    'DC', 'PR',
])


def _load_geo_data(path):
    """Read the college geo-data CSV, discarding a stray saved-index column."""
    geo_df = pd.read_csv(path)
    # 'Unnamed: 0' only exists when the CSV was written with its index.
    return geo_df.drop(columns='Unnamed: 0', errors='ignore')


def _prompt_current_location():
    """Ask the user for "City, ST" and geocode it to a (lat, lon) float pair."""
    # Imported lazily so modes that never geocode do not require geopy.
    from geopy.geocoders import Nominatim

    print('\nPlease Enter Your City And State Abbrev Separated By A Comma')
    raw_entry = input()
    parts = [part.strip() for part in raw_entry.split(',')]
    # Without a comma the first and last parts are the same text; don't
    # send it to the geocoder twice.
    query = parts[0] if len(parts) == 1 else f"{parts[0]} {parts[-1]}"

    location = Nominatim(user_agent='myapplication').geocode(query)
    if location is None:
        raise ValueError(f"Could not geocode location {raw_entry!r}")
    return location.latitude, location.longitude


def _distances_in_miles(df, origin):
    """Return the geodesic distance in miles from origin to each row of df."""
    from geopy.distance import geodesic

    miles = [
        geodesic(origin, (lat, lon)).miles
        for lat, lon in zip(df['Latitude'], df['Longitude'])
    ]
    return pd.Series(miles, index=df.index, dtype=float)


def _logistic_distance_weights(distance_miles):
    """Map distances to sigmoid weights clamped to the range [0.3, 0.7]."""
    farthest = distance_miles.abs().max()
    if pd.isna(farthest) or farthest == 0:
        # No spread in distance: treat every school as "closest" rather
        # than dividing by zero and turning every weight into NaN.
        scaled = distance_miles * 0.0
    else:
        scaled = distance_miles / farthest  # domain now 0, 1
    centred = (scaled - 0.5) * 12  # domain now -6, 6
    weights = 1 / (1 + np.exp(-centred))
    return weights.clip(lower=0.3, upper=0.7)


def factor_location_data(df, location_mode, geo_path=_GEO_DATA_PATH):
    """Adjust regression output according to the user's location preference.

    Modes 2-4 prompt on stdin and read the geo-data CSV, which is joined
    to ``df`` on ``C_ID`` (inner join, so unmatched schools are dropped).

    Args:
        df: Dataframe of scored schools; must contain ``C_ID`` for modes
            2-4 and ``Pred_Pts`` for mode 3.
        location_mode: Which strategy to apply:
            1 - location does not matter; ``df`` is returned as is.
            2 - keep only schools within a user-supplied radius in miles.
            3 - divide ``Pred_Pts`` by a clamped logistic weight of the
                distance, so nearer schools score higher.
            4 - keep only schools in the user-supplied list of states.
            Any other value returns ``df`` unchanged.
        geo_path: CSV with ``C_ID``, ``Latitude``, ``Longitude`` (and
            ``State`` for mode 4) columns.

    Returns:
        The adjusted dataframe with a fresh 0..n-1 index (modes 2-4), or
        the original ``df`` object for mode 1 and unknown modes.

    Raises:
        ValueError: If the radius is not numeric or the entered location
            cannot be geocoded.
    """
    if location_mode == 2:
        print('\nPlease Select Your n-Mile Radius')
        radius_miles = float(input())
        origin = _prompt_current_location()

        merged = df.merge(_load_geo_data(geo_path), how='inner', on='C_ID')
        within_radius = _distances_in_miles(merged, origin) <= radius_miles
        df = (merged[within_radius]
              .drop(columns=['Longitude', 'Latitude'])
              .reset_index(drop=True))

    elif location_mode == 3:
        origin = _prompt_current_location()

        merged = df.merge(_load_geo_data(geo_path), how='inner', on='C_ID')
        weights = _logistic_distance_weights(_distances_in_miles(merged, origin))
        # Closest schools get the smallest weight, so dividing by it
        # ranks them higher.
        merged['Pred_Pts'] = merged['Pred_Pts'] / weights
        df = (merged
              .drop(columns=['Longitude', 'Latitude'])
              .reset_index(drop=True))

    elif location_mode == 4:
        print('Enter The Series of State Codes Allowed Separated By Spaces')
        entered = input().replace(',', ' ').split()
        # Normalise case and silently ignore anything that is not a known code.
        allowed = {code.upper() for code in entered} & _VALID_STATE_CODES

        merged = df.merge(_load_geo_data(geo_path), how='inner', on='C_ID')
        df = (merged[merged['State'].isin(allowed)]
              .drop(columns=['Longitude', 'Latitude'])
              .reset_index(drop=True))

    return df  # df now has modified scores according to location_mode

In [24]:
# Load the raw dataset and run it through the location filter in mode 4,
# which prompts on stdin for the allowed state codes.
final_df = factor_location_data(
    pd.read_csv('../data/Raw/final_dataset.csv'),
    4,
)
# Show the first rows of the filtered result.
final_df.head()

Enter The Series of State Codes Allowed Separated By Spaces
TN KY


Unnamed: 0,C_ID,Name,City,State
0,ASBU,Asbury University,Wilmore,KY
1,BELL,Bellarmine University,Louisville,KY
2,BERE,Berea College,Berea,KY
3,BLMT,Belmont University,Nashville,TN
4,CARS,Carson-Newman University,Jefferson City,TN
