In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datascience import Table
import scipy as sp
%matplotlib inline
import statsmodels.formula.api as smf
plt.style.use('fivethirtyeight')

In [3]:
df = pd.read_csv('Family-Income-and-Expenditure.csv')

In [4]:
df.columns

Index(['Total Household Income', 'Region', 'Total Food Expenditure',
       'Main Source of Income', 'Agricultural Household indicator',
       'Bread and Cereals Expenditure', 'Total Rice Expenditure',
       'Meat Expenditure', 'Total Fish and  marine products Expenditure',
       'Fruit Expenditure', 'Vegetables Expenditure',
       'Restaurant and hotels Expenditure', 'Alcoholic Beverages Expenditure',
       'Tobacco Expenditure', 'Clothing, Footwear and Other Wear Expenditure',
       'Housing and water Expenditure', 'Imputed House Rental Value',
       'Medical Care Expenditure', 'Transportation Expenditure',
       'Communication Expenditure', 'Education Expenditure',
       'Miscellaneous Goods and Services Expenditure',
       'Special Occasions Expenditure', 'Crop Farming and Gardening expenses',
       'Total Income from Entrepreneurial Acitivites', 'Household Head Sex',
       'Household Head Age', 'Household Head Marital Status',
       'Household Head Highest Grade Compl

In [5]:
df['Region'].unique()

array(['CAR', 'Caraga', 'VI - Western Visayas', 'V - Bicol Region',
       ' ARMM', 'III - Central Luzon', 'II - Cagayan Valley',
       'IVA - CALABARZON', 'VII - Central Visayas',
       'X - Northern Mindanao', 'XI - Davao Region',
       'VIII - Eastern Visayas', 'I - Ilocos Region', 'NCR',
       'IVB - MIMAROPA', 'XII - SOCCSKSARGEN', 'IX - Zasmboanga Peninsula'], dtype=object)

In [6]:
df['Region'] = df['Region'].apply(lambda x: x.strip())
df['Region'] = df['Region'].str.split().str[:1].str.join('')
df['Region'].unique()

array(['CAR', 'Caraga', 'VI', 'V', 'ARMM', 'III', 'II', 'IVA', 'VII', 'X',
       'XI', 'VIII', 'I', 'NCR', 'IVB', 'XII', 'IX'], dtype=object)

In [7]:
region_to_island = {
    'Luzon': ['I', 'II', 'III', 'IVA', 'IVB', 'V', 'CAR',],
    'NCR': ['NCR'],
    'Visayas': ['VI', 'VII', 'VIII'],
    'Mindanao': ['IX', 'X', 'XI', 'XII', 'Caraga', 'ARMM']
}

df['Island'] = df['Region'].apply(
    lambda x: next(island for island, regions in region_to_island.items() if x in regions))


In [8]:
df = df[['Island', 'Total Household Income', 'Transportation Expenditure']]

In [9]:
df['Percentage'] = df['Transportation Expenditure'] / df['Total Household Income'] * 100

In [10]:
df1 = df.groupby(['Island']).mean()

In [11]:
luzon = df[df['Island'] == 'Luzon']['Percentage']
visayas = df[df['Island'] == 'Visayas']['Percentage']
mindanao = df[df['Island'] == 'Mindanao']['Percentage']
ncr = df[df['Island'] == 'NCR']['Percentage']

In [12]:
def meanBootstrap(x, n):
    data = []
    for i in range(n):
        bs = np.random.choice(x, len(x))
        data = np.append(data, bs)
    return data.mean()

In [13]:
n = 1000
bootstrappedPercentage = [meanBootstrap(luzon, n),
                          meanBootstrap(mindanao, n),
                          meanBootstrap(ncr, n),
                          meanBootstrap(visayas, n)]

df1['Bootstrapped Percentage'] = bootstrappedPercentage

In [14]:
df1

Unnamed: 0_level_0,Total Household Income,Transportation Expenditure,Percentage,Bootstrapped Percentage
Island,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Luzon,257928.90139,12556.461061,4.585537,4.585847
Mindanao,193103.696407,8626.175018,4.227119,4.226382
NCR,420861.861501,22546.104116,5.146213,5.145227
Visayas,218045.029111,9427.509121,4.302561,4.302057


In [15]:
odf = pd.read_csv('Family-Income-and-Expenditure.csv')

In [16]:
vehicle = odf[['Number of Car, Jeep, Van', 
               'Number of Motorcycle/Tricycle',
               'Number of Motorized Banca']]

In [17]:
df = df.join(vehicle, how = 'outer')

In [18]:
df['numbvehicle'] = df['Number of Car, Jeep, Van'] + df['Number of Motorcycle/Tricycle'] + df['Number of Motorized Banca']

In [19]:
df

Unnamed: 0,Island,Total Household Income,Transportation Expenditure,Percentage,"Number of Car, Jeep, Van",Number of Motorcycle/Tricycle,Number of Motorized Banca,numbvehicle
0,Luzon,480332,4776,0.994312,0,1,0,1
1,Luzon,198235,12900,6.507428,0,2,0,2
2,Luzon,82785,324,0.391375,0,0,0,0
3,Luzon,107589,6840,6.357527,0,0,0,0
4,Luzon,189322,6996,3.695292,0,1,0,1
...,...,...,...,...,...,...,...,...
41539,Mindanao,119773,2124,1.773355,0,0,0,0
41540,Mindanao,137320,2568,1.870084,0,0,0,0
41541,Mindanao,133171,2040,1.531865,0,0,0,0
41542,Mindanao,129500,2412,1.862548,0,0,0,0


In [20]:
percentageSlope, percentageIntercept = np.polyfit(df['no. of vehicle'], df['Percentage'], deg = 1)

KeyError: 'no. of vehicle'

In [None]:
fig, ax = plt.subplots(figsize = (10, 6))

x = np.arange(0, 1, 0.01)
sns.boxplot(data = df, x = 'no. of vehicle', y = 'Percentage')
#sns.scatterplot(data = df, x = '# of vehicle', y = 'Percentage')
#sns.lineplot(x = x, y = percentageSlope * x + percentageIntercept, errorbar = None, color = 'black', ax = ax)

In [None]:
df.describe()

In [None]:
zero = df[df['no. of vehicle'] == 0]['Percentage'].mean()
one = df[df['no. of vehicle'] == 1]['Percentage'].mean()
two = df[df['no. of vehicle'] == 2]['Percentage'].mean()
three = df[df['no. of vehicle'] == 3]['Percentage'].mean()
four = df[df['no. of vehicle'] == 4]['Percentage'].mean()
five = df[df['no. of vehicle'] == 5]['Percentage'].mean()
six = df[df['no. of vehicle'] == 6]['Percentage'].mean()
seven = df[df['no. of vehicle'] == 7]['Percentage'].mean()
eight = df[df['no. of vehicle'] == 8]['Percentage'].mean()

In [None]:
numberOfVehiclePercent = [zero, one, two, three, four, five, six, seven, eight]

In [None]:
intercept, slope = np.polyfit(range(9), numberOfVehiclePercent, deg = 1)

In [None]:
fig, ax = plt.subplots(figsize = (10, 6))
x = np.arange(0, 9, 1)
sns.lineplot(x = x, y = slope * x + intercept, color = 'black', ax = ax)
sns.scatterplot(x = range(9), y = numberOfVehiclePercent)

In [None]:
eight

In [None]:
intercept

In [None]:
slope

In [None]:
for i in range(9):
    print(slope * i + intercept)

In [None]:
from datascience import Table
import numpy as np
import matplotlib.pyplot as plots
import scipy as sp
%matplotlib inline
import statsmodels.formula.api as smf
plots.style.use('fivethirtyeight')

In [None]:
mod = smf.ols(formula='Percentage ~ numbvehicle', data=df)     #binago ko yung # of vehicle to numbvehicle di ko kasi ma code
res = mod.fit()
print(res.summary())

__Percentage and number of vehicle on all region__

the model explains 3.2% of the variance. It's a low value, suggesting that the model might not fit the data well.  
the F-statistic is 1357, and the p-value is very close to zero (2.71e-292), indicating the model is statistically significant.  
The intercept is 4.1004. when the independent variable(s) is zero, the predicted value of the dependent variable is 4.1004.  
The coefficient for this variable is 0.9950 which means for each unit increase in the number of vehicles, the predicted value of the dependent variable (Percentage) increases by 0.9950.  
the p-values are very close to zero both the intercept and the number of vehicles are statistically significant.


In [None]:
df

In [None]:
regions = df['Island'].unique()

for region in regions:
    # Subset the DataFrame for the current region
    df_Island = df[df['Island'] == region]

    # Run OLS regression for the current region
    mod = smf.ols(formula='Percentage ~ numbvehicle', data=df_Island)     
    res = mod.fit()

    # Print the summary for the current region
    print(f"Regression Summary for {region}:\n{res.summary()}\n")

In [None]:
df