# Contents

<pre></pre>
<font size=+1>
    
1. [Import Data](#Import-Data)<pre></pre>
2. [Merging Dataframes and Groupby](#Merging-Dataframes-and-Groupby)<pre></pre>
3. [Imputing Data](#Imputing-Data)<pre></pre>
4. [API Requests](#API-Requests)<pre></pre>
5. [SQL Queries](#SQL-Queries)<pre></pre>
6. [Logistic Regression Example](#Logistic-Regression-Example)<pre></pre>
7. [Dictionaries Example](#Dictionaries-Example)<pre></pre>
8. [Classes and OOP](#Classes-and-OOP)<pre></pre>
</font>

## Import Data

[Go back to the Table of Contents](#Contents)

### Packages I usually import, along with my preferred display settings

In [None]:
import pandas as pd
import numpy as np
# Raise pandas' display limits so large frames are not truncated when shown.
for _option_name, _option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 5000),
    ('display.width', 1000),
):
    pd.set_option(_option_name, _option_value)

### This creates a dataframe from a .csv file, specifying the file's encoding (replace `Encoding_Type` with e.g. `ISO-8859-1`)
### It also specifies the name of each dataframe column

In [None]:
# Read the CSV with an explicit encoding and explicit column names.
df = pd.read_csv(
    'Csv_File_Name.csv',
    names=['Index', 'Column'],
    encoding='Encoding_Type',
)

### Check type of data

In [None]:
df.dtypes

### Change data type

In [None]:
# Cast to a fixed-width 64-bit integer. A bare `int` maps to the platform's
# default integer, which is only 32 bits on Windows with NumPy < 2.0 and would
# overflow on long codes (presumably 'Full FIPS' holds up to 15 digits - TODO confirm).
# NOTE: integers drop leading zeros; keep the column as str if those matter.
string_fips_sliced_df['Full FIPS'] = string_fips_sliced_df['Full FIPS'].astype('int64')

### Strip Text

In [None]:
# Strip everything except digits, the decimal point and the minus sign
# (e.g. '$1,234.56' -> '1234.56'), then convert the cleaned strings to floats.
# The previous pattern kept letters and spaces but removed '.', so decimal
# values were silently corrupted ('1,234.56' became 123456.0).

import re

NON_NUMERIC_CHARS = re.compile(r'[^0-9.\-]')

# Column-wise vectorised replace; avoids DataFrame.applymap, which is
# deprecated since pandas 2.1.
string_money_df = string_money_df.apply(
    lambda column: column.astype(str).str.replace(NON_NUMERIC_CHARS, '', regex=True)
)
# Cells with nothing numeric in them (including missing values, whose text form
# is 'nan') are now empty strings; map them to NaN so the float cast succeeds.
string_money_df_cleaned = string_money_df.replace('', np.nan).astype(dtype=float)
string_money_df_cleaned.head()

### Count missing values

In [None]:
df['Column_Name'].isna().sum()

In [None]:
# Share of missing values for each variable in the dataframe, expressed as a
# fraction between 0 and 1 rounded to 4 decimals (multiply by 100 for percent).
missing_mask = DF_Final_Vars.isna()
DF_Percent_Missing = missing_mask.mean().round(4)

# Show the variables with the largest share of missing values first.
DF_Percent_Missing.sort_values(ascending=False)

In [None]:
# Drop every row whose value in the given column is missing, keeping all other
# columns. (Selecting the column first, as in df['col'].dropna(), returns only
# that single column as a Series instead of the filtered dataframe.)

adjusted_df = df.dropna(subset=['Column_With_Missing_Values'])

### Changes all cells with the value 'Cell_Value' to null values

In [None]:
# Swap the sentinel value for NaN everywhere in the frame.
df = df.replace(to_replace='Cell_Value', value=np.nan)

## Merging Dataframes and Groupby

[Go back to the Table of Contents](#Contents)

In [None]:
# Place the three imputed frames side by side (column-wise), aligned on the index.

imputed_parts = [imputed_cat_df, imputed_int_df, imputed_numeric_df]
Imputed_dataset = pd.concat(imputed_parts, axis='columns')

In [None]:
# Merging two dataframes

# Left join: every row of df1 is kept; df1's 'GIDBG' column is matched against
# df2's index. The result holds the columns of both frames.
df_merged = df1.merge(df2, how='left', left_on='GIDBG', right_index=True)

In [None]:
#Groupby aggregate on multiple variables

# Group the variables of interest by merging column and chemical, then sum the
# remaining columns within each (Merging_Column, chemical) pair.
# .sum() replaces .aggregate(sum): passing the Python builtin `sum` is
# deprecated in pandas 2.1+, where it currently resolves to the same
# groupby sum but is slated to change.

unique_merging_df = vars_of_interest_df.groupby(['Merging_Column', '30. CHEMICAL']).sum()

In [None]:
# List the dataframe's columns whose names appear in cat_var_list
# (a boolean mask over the column index does the selection).

is_listed_column = DF_Final_Vars.columns.isin(cat_var_list)
DF_Final_Vars.columns[is_listed_column]

### Manipulating dataframes

In [None]:
# Keep only the houses priced below 1 million dollars, then summarise them.

under_one_million = df['price'] < 1000000
df_Mill = df.loc[under_one_million]
df_Mill.describe()

In [None]:
#If you have a list of column names you would like to keep (If there are many many variables)
#This creates a dataframe of the variables of interest

Final_DF = df[List_Of_Kept_Var_Names]

## Imputing Data
[Go back to the Table of Contents](#Contents)

In [None]:
#impute categorical variables

# scipy is not needed for this step any more, but the imports stay because a
# later cell in this notebook uses `stats` (the QQ plot).
import scipy
from scipy import stats

# Fill each column's missing values with that column's most frequent value.
# DataFrame.mode() skips NaN and works on string columns; its first row holds
# one mode per column.
# The previous loop wrote into `imputed_cat_df`, which this cell never created,
# and called scipy.stats.mode(..., nan_policy='raise'), which raises as soon as
# a column contains the very NaNs it is supposed to fill.
column_modes = cat_var_df.mode()
if column_modes.empty:
    # No non-missing value anywhere: there is nothing to impute with.
    imputed_cat_df = cat_var_df.copy()
else:
    imputed_cat_df = cat_var_df.fillna(column_modes.iloc[0])

#check
imputed_cat_df.isna().sum()

In [None]:
#impute median for integers
# fillna returns a new frame, so `int_var_df` keeps its original values.
# The previous `imputed_int_df = int_var_df` only created a second name for the
# same frame, and replace(..., inplace=True) on an .iloc column selection is
# not guaranteed to write back to the parent frame (it does not under pandas'
# copy-on-write mode).
imputed_int_df = int_var_df.fillna(int_var_df.median())

#check
imputed_int_df.isna().sum()

In [None]:
#impute mean for numerical vars
# fillna returns a new frame, so `numeric_var_df` keeps its original values.
# The previous `imputed_numeric_df = numeric_var_df` only created a second name
# for the same frame, and replace(..., inplace=True) on an .iloc column
# selection is not guaranteed to write back to the parent frame (it does not
# under pandas' copy-on-write mode).
imputed_numeric_df = numeric_var_df.fillna(numeric_var_df.mean())

##Imputed numerical dataframe

#check
imputed_numeric_df.isna().sum()

## API Requests
[Go back to the Table of Contents](#Contents)

In [None]:
#Pull request from an API example

import requests

url = 'https://geo.fcc.gov/api/census/block/find'
parameters = {"lat": latitude_longitude_df['12. LATITUDE'][0], "lon": latitude_longitude_df['13. LONGITUDE'][0], 'showall':'false'}
# Pass the query string by keyword and always set a timeout: without one,
# requests waits indefinitely if the server never answers.
req = requests.get(url, params=parameters, timeout=10)

print(req.status_code)
print(req.headers)
print(req.text)

In [1]:
#This loop will pull requests for any set of latitudes and longitudes
#Change the length of the loop to go through an entire dataframe
#This loop saves a checkpoint file every CHECKPOINT_EVERY processed rows
#File names will be labeled like this 'FIPs' + '_' + str(count) + '.csv'
#Files are updated and overwritten every time the loop is run
#If there is an error in the loop SAVE PROGRESS TO A DIFFERENT FILE NAME

import requests
import datetime

CHECKPOINT_EVERY = 5000  # number of processed rows between checkpoint saves
REQUEST_TIMEOUT = 10     # seconds to wait for the API before giving up on a row

latitude_list = []
longitude_list = []
fips_code_list = []
timestamps_list = []
count = 0
url = 'https://geo.fcc.gov/api/census/block/find'
for i in range(0, 5000):
    latitude = latitude_longitude_df['12. LATITUDE'][i]
    longitude = latitude_longitude_df['13. LONGITUDE'][i]
    parameters = {"lat": latitude, "lon": longitude, 'showall':'false'}
    try:
        response = requests.get(url, params=parameters, timeout=REQUEST_TIMEOUT)
        data = response.json()
        fips_code = data['results'][0]['block_fips']
    except (requests.RequestException, ValueError, LookupError, TypeError) as error:
        # Network failure, non-JSON body, or a payload without a result.
        print('Error at index:', i, '-', error)
        fips_code = 'Error'
    # Append to all four lists exactly once per row. Previously a failed row
    # only grew fips_code_list, so the lists ended up with different lengths
    # and building the dataframe below raised a ValueError.
    latitude_list.append(latitude)
    longitude_list.append(longitude)
    fips_code_list.append(fips_code)
    timestamps_list.append(datetime.datetime.utcnow())
    if len(fips_code_list) % CHECKPOINT_EVERY == 0:
        FIPs_API_df = pd.DataFrame({'Latitude': latitude_list, 'Longitude': longitude_list, 'FIPS': fips_code_list, 'API_Time': timestamps_list})
        FIPs_API_df.to_csv('FIPs' + '_' + str(count) + '.csv')
        count += 1

FIPs_API_df = pd.DataFrame({'Latitude': latitude_list, 'Longitude': longitude_list, 'FIPS': fips_code_list, 'API_Time': timestamps_list})
FIPs_API_df.head()

NameError: name 'latitude_longitude_df' is not defined

In [None]:
#Creates a dataframe pulling information from an api
#Same request loop as the previous cell, for rows 25000-29999, saved under a different file prefix

import requests
import datetime

CHECKPOINT_EVERY = 5000  # number of processed rows between checkpoint saves
REQUEST_TIMEOUT = 10     # seconds to wait for the API before giving up on a row

latitude_list2 = []
longitude_list2 = []
fips_code_list2 = []
timestamps_list = []
count = 0
url = 'https://geo.fcc.gov/api/census/block/find'
for i in range(25000, 30000):
    latitude = latitude_longitude_df['12. LATITUDE'][i]
    longitude = latitude_longitude_df['13. LONGITUDE'][i]
    parameters = {"lat": latitude, "lon": longitude, 'showall':'false'}
    try:
        response = requests.get(url, params=parameters, timeout=REQUEST_TIMEOUT)
        data = response.json()
        fips_code = data['results'][0]['block_fips']
    except (requests.RequestException, ValueError, LookupError, TypeError) as error:
        # Network failure, non-JSON body, or a payload without a result.
        print('Error at index:', i, '-', error)
        fips_code = 'Error'
    # Append to all four lists exactly once per row. Previously a failed row
    # only grew fips_code_list2, so the lists ended up with different lengths
    # and building the dataframe below raised a ValueError.
    latitude_list2.append(latitude)
    longitude_list2.append(longitude)
    fips_code_list2.append(fips_code)
    timestamps_list.append(datetime.datetime.utcnow())
    if len(fips_code_list2) % CHECKPOINT_EVERY == 0:
        FIPs_API_df = pd.DataFrame({'Latitude': latitude_list2, 'Longitude': longitude_list2, 'FIPS': fips_code_list2, 'API_Time': timestamps_list})
        FIPs_API_df.to_csv('FIPs1987Extra' + '_' + str(count) + '.csv')
        count += 1

In [None]:
##Example of combining API requests (DarkSky weather) with data pulled from a SQL database

import requests
import json

import os


class WeatherGetter(object):
    """Ask the DarkSky forecast API whether it rained in Berlin on given dates.

    The API key is no longer hard-coded in the notebook. Pass it as
    ``secret_key`` or export it in the ``DARKSKY_API_KEY`` environment
    variable. The key that used to be embedded here was exposed in the source
    and should be treated as compromised.
    """

    # Environment variable consulted when no key is passed to the constructor.
    API_KEY_ENV_VAR = "DARKSKY_API_KEY"
    # Seconds to wait for the API; without a timeout requests can hang forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, secret_key=None):
        """Store the API key, the Berlin coordinates and the request settings.

        secret_key: DarkSky API key; when None it is read from the
            DARKSKY_API_KEY environment variable (empty string if unset).
        """
        if secret_key is None:
            secret_key = os.environ.get(self.API_KEY_ENV_VAR, "")
        self.secret_key = secret_key
        self.berlin_lat = "52.5200"
        self.berlin_long = "13.4050"
        self.url_base = "https://api.darksky.net/forecast"
        self.exclude = 'currently,flags,minutely,hourly,alerts'

    def get_weather_data_for_date(self, datetime_string, verbose=True):
        """Request the forecast for noon of the given 'yyyy-mm-dd' date.

        Returns the HTTP response when the status code is 200 (the code is
        printed when ``verbose``).
        Raises ValueError when no API key is configured or when the API
        answers with any other status code.
        """
        if not self.secret_key:
            raise ValueError(
                "No DarkSky API key configured: pass secret_key=... or set the "
                "{} environment variable".format(self.API_KEY_ENV_VAR))

        year, month, day = self.format_datetime(datetime_string)

        # Local name chosen so it does not shadow the `datetime` module.
        noon_timestamp = "{}-{}-{}T12:00:00".format(year, month, day)
        full_url = "{}/{}/{},{},{}?exclude={}".format(self.url_base, self.secret_key,
                                                     self.berlin_lat, self.berlin_long,
                                                     noon_timestamp, self.exclude)
        response = requests.get(full_url, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            raise ValueError("Error getting data from DarkSky API: Response Code {}".format(response.status_code))
        if verbose:
            print(response.status_code)
        return response

    def was_raining(self, response, verbose=True):
        """Return True when the first daily data point's icon is 'rain'.

        The icon value is printed when ``verbose``. A data point without an
        'icon' entry counts as "no rain" instead of raising KeyError.
        """
        first_day = json.loads(response.text)['daily']['data'][0]
        icon = first_day.get('icon')
        if verbose:
            print(icon)
        return icon == 'rain'

    def format_datetime(self, datetime_string):
        """Split a 'yyyy-mm-dd...' string into (year, month, day) strings.

        Only the first ten characters are used, so a trailing time part
        (e.g. '2019-01-03 00:00:00') no longer leaks into the day.
        """
        year = datetime_string[:4]
        month = datetime_string[5:7]
        day = datetime_string[8:10]

        return year, month, day

    def did_rain_on_date(self, datetime_string):
        """Fetch the forecast for one date and return whether it rained (no printing)."""
        response = self.get_weather_data_for_date(datetime_string, verbose=False)
        return self.was_raining(response, verbose=False)

    def get_weather_for_all_dates(self, dates_list):
        """Expects input of dates in yyyy-mm-dd format

        Returns a dictionary where each date is the key. Rain days have a value of True, all others are False"""
        weather_dict = {}

        # One API call per date.
        for date in dates_list:
            weather_dict[date] = self.did_rain_on_date(date)

        return weather_dict
    
# Smoke-test the weather lookup on a single date.
wg = WeatherGetter()
wg.did_rain_on_date('2019-01-03')
import sqlite3
import pandas as pd
import numpy as np

# Load every match of the 2011 season from the SQLite database into a dataframe.
conn = sqlite3.connect('''database.sqlite''')
cur = conn.cursor()
cur.execute('''select * from matches where season = 2011''')

season_rows = cur.fetchall()
matches = pd.DataFrame(season_rows)
# cursor.description holds one tuple per result column; its first item is the column name.
matches.columns = [column_info[0] for column_info in cur.description]
matches.head()

# Collect the distinct match dates so each date is sent to the API only once.
unique_dates = matches.Date.unique()
print("# of Unique Game Dates in 2011 Season: {}".format(len(unique_dates)))

# Rain status (True/False) for each unique date of the 2011 season.
rain_dates = wg.get_weather_for_all_dates(unique_dates)
rain_dates

# Boolean 'Rain_Game' column: look up each match's date in the rain dictionary.
rain_game = [rain_dates[match_date] for match_date in matches.Date]

matches['Rain_Game'] = rain_game
matches

## Build a zeroed statistics record for every unique home team.
## These values will be updated as we go through each match in the matches table.
all_teams = matches['HomeTeam'].unique()
print("# of Unique Teams: {}".format(len(all_teams)))
stat_names = ('total_matches_2011', 'total_wins_2011', 'total_losses_2011',
              'rain_wins_2011', 'rain_losses_2011', 'total_goals_2011')
# Each team gets its own fresh dict so later updates do not leak between teams.
teams_data = {team: {stat: 0 for stat in stat_names} for team in all_teams}
print(len(teams_data))

## SQL Queries
[Go back to the Table of Contents](#Contents)

In [None]:
#SQL
# Import necessary packages
import sqlite3
import pandas as pd

# Create (or open) the database school.sqlite
conn = sqlite3.connect('school.sqlite')

# Create the contactInfo table
cur = conn.cursor()
cur.execute("""CREATE TABLE contactInfo (
                                        userId INTEGER PRIMARY KEY,
                                        firstName TEXT,
                                        lastName TEXT,
                                        role TEXT,
                                        telephone INTEGER,
                                        street TEXT,
                                        city TEXT,
                                        state TEXT,
                                        zipcode TEXT
                                        );
            """)

# Iterate over the contact list and populate the contactInfo table.
# Values are bound with ? placeholders instead of being formatted into the SQL
# text, so names containing an apostrophe (O'Brien) no longer break the
# statement and no value can be interpreted as SQL.
# NOTE(review): the 'telephone ' and 'zipcode ' keys carry a trailing space;
# presumably that matches the keys in `contacts` - confirm against the data.
insert_sql = """INSERT INTO contactInfo (firstName, lastName, role, telephone, street, city, state, zipcode)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?);"""
contact_rows = [
    (contact['firstName'], contact['lastName'], contact['role'], contact['telephone '],
     contact['street'], contact['city'], contact['state'], contact['zipcode '])
    for contact in contacts
]
cur.executemany(insert_sql, contact_rows)

# Query the table to ensure it is populated
# (this prose line, and the ones further down, used to sit in the cell as bare
# text, which made the whole cell a SyntaxError)
cur.execute("""SELECT * FROM contactInfo;""")
df = pd.DataFrame(cur.fetchall())
df.columns = [x[0] for x in cur.description]
df

# Commit your changes to the database
# Persist your changes by committing them to the database.
conn.commit()

In [None]:
#SQL
# Remove Duplicate Entries
# An analyst just realized that there is a duplicate entry in the contactInfo table! Find and remove it.

# Find the duplicate entry
cur.execute("""SELECT firstName, lastName, telephone, COUNT(*) 
               FROM contactInfo
               GROUP BY 1,2,3
               HAVING COUNT(*) > 1;""").fetchall()
# Recorded output: [('Jane', 'Evans', 3259909290, 2)]

# Delete only the extra copies: keep the row with the smallest userId in each
# (firstName, lastName, telephone) group. Deleting by telephone number alone
# removed the original row together with its duplicate.
cur.execute('''DELETE FROM contactInfo
               WHERE userId NOT IN (SELECT MIN(userId)
                                    FROM contactInfo
                                    GROUP BY firstName, lastName, telephone);''')

# Check that the duplicate entry was removed (an empty list is expected)
cur.execute("""SELECT firstName, lastName, telephone, COUNT(*) 
               FROM contactInfo
               GROUP BY 1,2,3
               HAVING COUNT(*) > 1;""").fetchall()

In [None]:
#SQL
# Updating an Address
# Ed Lyman just moved to 2910 Simpson Avenue York, PA 17403. Update his address accordingly.

# Update Ed's address. The values are bound with ? placeholders: the earlier
# statement mixed double-quoted "strings" into the SQL, which SQLite only
# accepts as string literals through a legacy fallback (double quotes normally
# denote identifiers).
cur.execute('''UPDATE contactInfo
               SET street = ?,
                   city = ?,
                   state = ?,
                   zipcode = ?
               WHERE firstName = ? AND lastName = ?;
            ''', ("2910 Simpson Avenue", 'York', 'PA', '17403', "Ed", "Lyman"))

# Query the database to ensure the change was made
cur.execute("""SELECT * FROM contactInfo;""")
df = pd.DataFrame(cur.fetchall())
df.columns = [x[0] for x in cur.description]
df

# Once again, persist your changes by committing them to the database.
conn.commit()

## Logistic Regression Example
[Go back to the Table of Contents](#Contents)

In [None]:
##Creates awesome correlation matrix using Seaborn
# Needs matplotlib (as plt) and seaborn (as sns) to be imported

corr = new_data.corr()
plt.figure(figsize=(20,10))
# Hide the upper triangle (diagonal included) so each pair is shown once.
# The builtin `bool` replaces `np.bool`, an alias that was removed in
# NumPy 1.24 and raises AttributeError there.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.set_context("paper", font_scale=2)
plt.title('Variables of Interest in our Analysis', fontsize=40)
# Reuse `corr` instead of computing the correlation matrix a second time.
sns.heatmap(corr, annot=True, linewidths=.5, cmap="coolwarm", mask=mask)

In [None]:
##Great QQ plot for Regression Analysis

# Q-Q plot of the model residuals against a Student's t distribution whose
# parameters are fitted to the data (fit=True); line='45' draws the 45-degree
# reference line.
residuals = model.resid
fig = sm.qqplot(residuals, dist=stats.t, fit=True, line='45')
plt.show()

In [None]:
##Actual vs. predicted plot for regression using Seaborn

sns.set(style='ticks')
# Pass x by keyword: since seaborn 0.12 the first positional argument of
# regplot is `data`, so a positional Y_test is no longer used as x.
sns.regplot(x=Y_test, y=predictions, scatter_kws={'alpha':0.05});
plt.title('Prediction Performance for House Price, < $1 million, 5-folds')
# The labels now match the plotted data (x = test values, y = model
# predictions); they were swapped before.
plt.xlabel('Test Data')
plt.ylabel('Trained Regression')

In [None]:
#Logistic Regression Implementation

# The data is assumed to be prepared already (x_train / y_train / x_test / y_test).
import statsmodels.api as sm
import sklearn.linear_model 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Fit a default logistic regression on the training split; fit() returns the
# fitted estimator itself, which is what sk_res refers to.
logmodel = LogisticRegression()
sk_res = logmodel.fit(x_train, y_train)

# Predict the test split and compare the predictions with the true labels.
sk_predictions = logmodel.predict(x_test)
confusion_matrix(y_test, sk_predictions)

## Dictionaries Example
[Go back to the Table of Contents](#Contents)

In [3]:
#Make a pizza input and dictionary
MAX_PIZZAS = 3    # limit announced in the first prompt
MAX_TOPPINGS = 4  # limit announced in the second prompt


def ask_count(prompt, maximum):
    """Prompt until the user enters a whole number between 0 and `maximum`.

    The prompts promise a maximum that was never enforced before, and a
    non-numeric answer used to crash the cell with a ValueError.
    """
    while True:
        answer = input(prompt)
        try:
            number = int(answer)
        except ValueError:
            print("Please enter a whole number.")
            continue
        if 0 <= number <= maximum:
            return number
        print(f"Please enter a number between 0 and {maximum}.")


p = ask_count("Enter the no. of pizzas you want to buy (max 3): ", MAX_PIZZAS)
t = ask_count("Enter the toppings you would like in each pizza (max 4): ", MAX_TOPPINGS)
dict_pizza = {}
# NOTE: the flavor is the dictionary key, so entering the same flavor twice
# replaces the earlier pizza's toppings.
for pizza_number in range(1, p + 1):
    pizza = input(f"\nEnter the flavor of Pizza No. {pizza_number}: ")
    dict_pizza[pizza] = [
        input(f"Enter the flavor of Topping No. {topping_number}: ")
        for topping_number in range(1, t + 1)
    ]

print(dict_pizza)

Enter the no. of pizzas you want to buy (max 3): 1
Enter the toppings you would like in each pizza (max 4): 1

Enter the flavor of Pizza No. 1: cheese
Enter the flavor of Topping No. 1: cheese
{'cheese': ['cheese']}


In [15]:
#Final
#Here is what your program would look like with all the changes

import os

# Menu data used by the ordering program below.
# Names a customer may choose from; any other input is rejected by TakeOrderInput.
available_pizzas = ['margarita', 'pollo', '4cheese', 'bolognese', 'vegetarian']
available_toppings = ['mushroom', 'onions', 'green pepper', 'extra cheese']
# Price per item, keyed by the names above; Order adds one pizza price and one
# topping price to get its total.
pizza_prices = {'margarita': 5, 'pollo': 7, '4cheese': 6, 'bolognese': 8, 'vegetarian': 6.5}
topping_prices = {'mushroom':1, 'onions': 2, 'green pepper':3, 'extra cheese':4}

def ShowMenu():
    """Clear the console, then print the available pizzas and toppings."""
    # 'cls' only exists on Windows; other platforms use 'clear'.
    os.system('cls' if os.name == 'nt' else 'clear')
    print("Available Pizzas:\n")
    print(*available_pizzas,sep = ', ')
    print("\n\nAvailable Topings:\n")
    print(*available_toppings,sep = ', ')
    print('\n\n')

def TakeOrderInput():
    """Prompt until the user picks a pizza and a topping that are on the menu.

    Returns:
        (pizza, topping): the two accepted menu names.
    """
    # 'cls' and 'pause' are Windows-only shell commands; pick the platform's
    # clear command and use a plain input() as the pause so the prompts also
    # work on macOS and Linux.
    clear_command = 'cls' if os.name == 'nt' else 'clear'
    os.system(clear_command)
    print("Hi, welcome to our text based pizza ordering")
    while True:
        os.system(clear_command)
        ShowMenu()
        pizza = input("Please choose a pizza: ")
        if pizza not in available_pizzas:
            print(f"I am sorry, we currently do not have {pizza}\n.")
            input("Press Enter to continue...")
            continue
        topping = input("Please choose a topping: ")
        if topping not in available_toppings:
            print(f"I am sorry, we currently do not have {topping}\n.")
            input("Press Enter to continue...")
            continue

        print(f"Final order: {pizza} with topping {topping}: ")
        return pizza,topping

class Order:
    """One pizza with one topping, priced from pizza_prices and topping_prices."""

    def __init__(self, pizza=None, topping=None):
        """Create an order.

        pizza, topping: menu names. When either is omitted the user is
            prompted for both through TakeOrderInput(), which is what the
            original no-argument constructor always did.
        Raises KeyError if a name passed in directly is not in the price tables.
        """
        if pizza is None or topping is None:
            pizza,topping = TakeOrderInput()
        self.type = pizza
        self.topping = topping
        self.PizzaPrice = pizza_prices[pizza]
        self.ToppingPrice = topping_prices[topping]
        # Taxes are not applied; the total is simply pizza + topping.
        self.Total = self.PizzaPrice + self.ToppingPrice


# Take orders until the customer answers 'n', then print the grand total.
choice = True
orders = []
orderchoice = input("Welcome! Would you like to order ? (y/n): ")
if orderchoice != 'n':
    while choice:
        # Creating an Order prompts the customer for a pizza and a topping.
        orders.append(Order())
        newchoice = input("Would you like to order again? (y/n): ")
        if newchoice == 'n':
            break
else:
    print("Have a nice day!")

# Sum of the totals of every order taken (0 when nothing was ordered).
total = sum(order.Total for order in orders)

print("Total: ",total, '$')

Welcome! Would you like to order ? (y/n): y
Hi, welcome to our text based pizza ordering
Available Pizzas:

margarita, pollo, 4cheese, bolognese, vegetarian


Available Topings:

mushroom, onions, green pepper, extra cheese



Please choose a pizza: taco
I am sorry, we currently do not have taco
.
Available Pizzas:

margarita, pollo, 4cheese, bolognese, vegetarian


Available Topings:

mushroom, onions, green pepper, extra cheese



Please choose a pizza: margarita
Please choose a topping: mushroom
Final order: margarita with topping mushroom: 
Would you like to order again? (y/n): n
Total:  6 $


## Classes and OOP
[Go back to the Table of Contents](#Contents)

In [30]:
class Student(object):
    """A student with basic details and a mapping of course name -> grade."""

    def __init__(self, name, age, gender, level, grades=None):
        """Store the student's details.

        grades: optional dict of course -> grade; a new empty dict is created
            when it is omitted.
        """
        self.name = name
        self.age = age
        self.gender = gender
        self.level = level
        # Substitute a fresh dict only when no mapping was supplied.
        # `grades or {}` also threw away an *empty* dict passed by the caller,
        # so later setGrade() calls were invisible through the caller's reference.
        self.grades = {} if grades is None else grades

    def setGrade(self, course, grade):
        """Record (or overwrite) the grade for a course."""
        self.grades[course] = grade

    def getGrade(self, course):
        """Return the grade for a course; raises KeyError if none is recorded."""
        return self.grades[course]

    def getGPA(self):
        """Return the mean of all recorded grades, or 0.0 when there are none.

        Without the guard an empty grade book raised ZeroDivisionError.
        """
        if not self.grades:
            return 0.0
        return sum(self.grades.values())/len(self.grades)

# Two example students, each with a single math grade
john, jane = (
    Student("John", 12, "male", 6, {"math":3.3}),
    Student("Jane", 12, "female", 6, {"math":3.5}),
)

# With one grade each, the GPA equals that grade
for student in (john, jane):
    print(student.getGPA())

3.3
3.5


In [19]:
#Class Inheritance

# parent class
class Bird:
    """Parent class of the inheritance example; every method just prints a message."""

    def __init__(self):
        """Announce that the bird has been created."""
        print("Bird is ready")

    def whoisThis(self):
        """Print the class's identity; subclasses override this."""
        print("Bird")

    def swim(self):
        """Print the swimming message; inherited unchanged by subclasses."""
        print("Swim faster")

# child class
class Penguin(Bird):
    """Child class: extends Bird's constructor, overrides whoisThis and adds run."""

    def __init__(self):
        """Run Bird's constructor first, then announce the penguin."""
        super(Penguin, self).__init__()
        print("Penguin is ready")

    def whoisThis(self):
        """Override of Bird.whoisThis."""
        print("Penguin")

    def run(self):
        """Behaviour that only the child class has."""
        print("Run faster")

# Create a penguin, then call the overridden, the inherited and the new method in turn.
peggy = Penguin()
for action in (peggy.whoisThis, peggy.swim, peggy.run):
    action()

Bird is ready
Penguin is ready
Penguin
Swim faster
Run faster


In [20]:
def main():
    """Show two ways to iterate over a dict and that an items() view is live."""
    # Dictionary with string keys and int values
    wordFrequency = {"Hello": 7, "hi": 10, "there": 45, "at": 23, "this": 77}

    # 1) Iterate over the keys and look each value up
    for word in wordFrequency.keys():
        print(word, " :: ", wordFrequency[word])

    print("**************")

    # 2) Iterate over (key, value) pairs with items()
    for pair in wordFrequency.items():
        print(pair[0], " :: ", pair[1])

    # items() returns a view, not a copy: it reflects later changes to the dict
    dictView = wordFrequency.items()

    print("Dictionary View before modification : ", dictView, sep ="\n")

    # Change one value through the dictionary itself
    wordFrequency.update({"hi": 90})

    print("Dictionary View after modification : ", dictView, sep ="\n")


if __name__ == "__main__":
    main()

Hello  ::  7
hi  ::  10
there  ::  45
at  ::  23
this  ::  77
**************
Hello  ::  7
hi  ::  10
there  ::  45
at  ::  23
this  ::  77
Dictionary View before modification : 
dict_items([('Hello', 7), ('hi', 10), ('there', 45), ('at', 23), ('this', 77)])
Dictionary View after modification : 
dict_items([('Hello', 7), ('hi', 90), ('there', 45), ('at', 23), ('this', 77)])
