## Importing Libraries


In [0]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt



### Data Source: https://collegescorecard.ed.gov

In [0]:
# Loading dataset and selecting only required columns.
Data_URL="https://ed-public-download.app.cloud.gov/downloads/Most-Recent-Cohorts-All-Data-Elements.csv"

cols=["CONTROL","MD_EARN_WNE_P10"]
Df= pd.read_csv(Data_URL,usecols=cols)

In [484]:
Df.describe(include= "all")

Unnamed: 0,CONTROL,MD_EARN_WNE_P10
count,6806.0,5503
unique,,634
top,,PrivacySuppressed
freq,,686
mean,2.08757,
std,0.835281,
min,1.0,
25%,1.0,
50%,2.0,
75%,3.0,


In [485]:
Df.shape

(6806, 2)

In [486]:
Df.head(10)
Df

Unnamed: 0,CONTROL,MD_EARN_WNE_P10
0,1,31000
1,1,41200
2,2,39600
3,1,46700
4,1,27700
...,...,...
6801,1,
6802,1,
6803,3,
6804,3,


In [487]:
Df.dtypes

CONTROL             int64
MD_EARN_WNE_P10    object
dtype: object

In [488]:
Df["CONTROL"]=Df["CONTROL"].astype(str)
Df.dtypes

CONTROL            object
MD_EARN_WNE_P10    object
dtype: object

In [489]:
# Filter the data to display only public institutions (using CONTROL variable)
# Public institutions ="1" form documentation.

Df=Df[Df["CONTROL"].str.contains("1")]
Df



Unnamed: 0,CONTROL,MD_EARN_WNE_P10
0,1,31000
1,1,41200
3,1,46700
4,1,27700
5,1,44500
...,...,...
6799,1,
6800,1,
6801,1,
6802,1,


In [490]:
# Drop the colleges that have zero or missing potential earnings
Df= Df[Df["MD_EARN_WNE_P10"]!=0]
Df

Unnamed: 0,CONTROL,MD_EARN_WNE_P10
0,1,31000
1,1,41200
3,1,46700
4,1,27700
5,1,44500
...,...,...
6799,1,
6800,1,
6801,1,
6802,1,


In [491]:
# Fiter and drop all row that containa PrivacySuppressed.

Df=Df[Df["MD_EARN_WNE_P10"]!="PrivacySuppressed"]

Df

Unnamed: 0,CONTROL,MD_EARN_WNE_P10
0,1,31000
1,1,41200
3,1,46700
4,1,27700
5,1,44500
...,...,...
6799,1,
6800,1,
6801,1,
6802,1,


In [492]:
# Finding colleges with missing  potential earnings 
Df["MD_EARN_WNE_P10"].isna().sum()

147

In [493]:
Df=Df.fillna(0)
Df

Unnamed: 0,CONTROL,MD_EARN_WNE_P10
0,1,31000
1,1,41200
3,1,46700
4,1,27700
5,1,44500
...,...,...
6799,1,0
6800,1,0
6801,1,0
6802,1,0


In [494]:
Df= Df[Df["MD_EARN_WNE_P10"]!=0]
Df

Unnamed: 0,CONTROL,MD_EARN_WNE_P10
0,1,31000
1,1,41200
3,1,46700
4,1,27700
5,1,44500
...,...,...
6592,1,31700
6607,1,50600
6608,1,50600
6609,1,50600


In [428]:
# droping out colleges with missing potential earnings
#Df=Df["MD_EARN_WNE_P10"].dropna()
#Df #This gives our working population.

TypeError: ignored

In [496]:
#Df["MD_EARN_WNE_P10"]=Df["MD_EARN_WNE_P10"].astype(int)
Df["MD_EARN_WNE_P10"]=Df["MD_EARN_WNE_P10"].astype(int)
Df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,CONTROL,MD_EARN_WNE_P10
0,1,31000
1,1,41200
3,1,46700
4,1,27700
5,1,44500
...,...,...
6592,1,31700
6607,1,50600
6608,1,50600
6609,1,50600


In [497]:
# selecting 50 random colleges from the population
Df.sample(50)

Unnamed: 0,CONTROL,MD_EARN_WNE_P10
3179,1,34900
1807,1,28600
3712,1,34500
2278,1,32100
660,1,29300
2774,1,32300
829,1,22500
3737,1,35900
3980,1,27300
3185,1,29400


In [498]:
# Calculating mean of sample
sample_size=50

Df_sample = Df.sample(sample_size)
sample_mean = Df_sample["MD_EARN_WNE_P10"].mean()
sample_mean    

35570.0

In [499]:
# calculating the sample standard deviation the hard way

Wage_list = list(Df_sample["MD_EARN_WNE_P10"])
degree_of_freedom = sample_size - 1
variance = 0
for wage in Wage_list:
  variance += (wage-sample_mean)**2
sample_std = math.sqrt(variance/degree_of_freedom)
round(sample_std, 2)

11205.16

In [500]:
# Calculating sample standard deviation alternatively using Pandas.
sample_std = Df_sample["MD_EARN_WNE_P10"].std()
round(sample_std, 2)

11205.16

In [501]:
# Calculating sample standard deviation using numpy.
sample_std = np.std(Df_sample["MD_EARN_WNE_P10"], ddof=1)
round(sample_std, 2)

11205.16

In [502]:
# To be able to calculate CI, we must calculate the standard error first 
# Since we assume we don't know the population standard deviation, we use sample standard deviation as an estimate

std_err = sample_std / math.sqrt(sample_size)       
std_err

1584.6489342747054

In [503]:
# Calculate 68% Confidence Interval (CI) - one standard error from the population mean
# 68% chances the population mean is within the sample_mean (+ or -) the standard error (SE)

LCL_68 = sample_mean -  std_err
UCL_68 = sample_mean +  std_err

print("Lower confidence limit at 68% confidence level = ", round(LCL_68,2))
print("Upper confidence limit at 68% confidence level = ", round(UCL_68,2))

Lower confidence limit at 68% confidence level =  33985.35
Upper confidence limit at 68% confidence level =  37154.65


In [504]:
# Calculate 95% Confidence Interval (CI) - one standard error from the population mean
# 90% chances the population mean is within the sample_mean + or - 2 * the standard error (SE)

LCL_95 = sample_mean -  2 * std_err
UCL_95 = sample_mean +  2 * std_err
print("Lower confidence limit at 95% confidence level = ", round(LCL_95,2))
print("Upper confidence limit at 95% confidence level = ", round(UCL_95,2))


Lower confidence limit at 95% confidence level =  32400.7
Upper confidence limit at 95% confidence level =  38739.3


In [505]:
# Calculate 99.7% Confidence Interval (CI) - one standard error from the population mean
# 99.7% chances the population mean is within the sample_mean + or - 3 * the standard error (SE)

LCL_997 = sample_mean -  3 * std_err
UCL_997 = sample_mean +  3 * std_err
print("Lower confidence limit at 99.7% confidence level = ", round(LCL_997,2))
print("Upper confidence limit at 99.7% confidence level = ", round(UCL_997,2))

Lower confidence limit at 99.7% confidence level =  30816.05
Upper confidence limit at 99.7% confidence level =  40323.95


In [506]:
# Calculate population mean

Df["MD_EARN_WNE_P10"].mean()

36083.21640582839

In [507]:
# Sample_Size=100
# Calculating mean of Sample
Sample_Size=100

Df_Sample = Df.sample(Sample_Size)
Sample_mean = Df_Sample["MD_EARN_WNE_P10"].mean()
Sample_mean 

34646.0

In [508]:
# Calculating sample standard deviation alternatively using Pandas.
Sample_std = Df_Sample["MD_EARN_WNE_P10"].std()
round(Sample_std, 2)

10007.09

In [509]:
# To be able to calculate CI, we must calculate the standard error first 
# Since we assume we don't know the population standard deviation, we use sample standard deviation as an estimate

Std_err = Sample_std / math.sqrt(Sample_Size)       
Std_err

1000.709263624071

In [510]:
# Calculate 68% Confidence Interval (CI) - one standard error from the population mean
# 68% chances the population mean is within the sample_mean (+ or -) the standard error (SE)

LCL_68 = Sample_mean -  Std_err
UCL_68 = Sample_mean +  Std_err

print("Lower confidence limit at 68% confidence level = ", round(LCL_68,2))
print("Upper confidence limit at 68% confidence level = ", round(UCL_68,2))

Lower confidence limit at 68% confidence level =  33645.29
Upper confidence limit at 68% confidence level =  35646.71


In [511]:
# Calculate 95% Confidence Interval (CI) - one standard error from the population mean
# 90% chances the population mean is within the sample_mean + or - 2 * the standard error (SE)

LCL_95 = Sample_mean -  2 * Std_err
UCL_95 = Sample_mean +  2 * Std_err
print("Lower confidence limit at 95% confidence level = ", round(LCL_95,2))
print("Upper confidence limit at 95% confidence level = ", round(UCL_95,2))


Lower confidence limit at 95% confidence level =  32644.58
Upper confidence limit at 95% confidence level =  36647.42


In [512]:
# Calculate 99.7% Confidence Interval (CI) - one standard error from the population mean
# 99.7% chances the population mean is within the sample_mean + or - 3 * the standard error (SE)

LCL_997 = Sample_mean -  3 * Std_err
UCL_997 = Sample_mean +  3 * Std_err
print("Lower confidence limit at 99.7% confidence level = ", round(LCL_997,2))
print("Upper confidence limit at 99.7% confidence level = ", round(UCL_997,2))

Lower confidence limit at 99.7% confidence level =  31643.87
Upper confidence limit at 99.7% confidence level =  37648.13


## Obervation.
The higher the sample size, the closer the sample mean to the population mean, and the closer the sample mean is located within the CI of the population mean.