# Student Aggregation Analysis

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="ticks", color_codes=True)
from statsmodels.graphics.mosaicplot import mosaic
%matplotlib inline

In [34]:
# Importing the Economics Survey data
join_df = pd.read_csv('./data/join.csv')
join_df['Pass'] = np.where(join_df["Final"] >= 0.68, "Yes", "No")
join_df['Class'] = join_df['Class'].astype(int).astype(str)
join_df['Year'] = join_df['Year'].astype(int).astype(str)
join_df['Taste'] = join_df['Taste'].astype(int)
join_df['Price'] = join_df['Price'].astype(int)
join_df['Nutrition'] = join_df['Nutrition'].astype(int)
join_df['Presentation'] = join_df['Presentation'].astype(int)
join_df['Height'] = join_df['Feet']*12 + join_df['Inches']
join_df['BMI'] = (join_df['Weight'] / (join_df['Height']**2))*703
join_df.head()

Unnamed: 0,id,School,Gender,Race,Local,Full Time,Future,Effectiveness,Online Supplement,Supplement Use,...,Parents Overweight,Final,Attend,Year,Semester,Class,Platform,Pass,Height,BMI
0,68968,Diablo Valley College,Male,Asian,Local Student,Full time,Very helpful,Very effective,Very helpful,Somewhat difficult,...,No,0.81252,0.791667,2016,Spring,101,Aplia,Yes,69.0,22.148708
1,46998,Diablo Valley College,Female,Asian,Local Student,Full time,Somewhat helpful,Somewhat effective,Somewhat helpful,Not too difficult,...,Not sure,0.640884,1.0,2016,Spring,101,Aplia,No,64.0,27.460938
2,79024,Diablo Valley College,Female,African American,Local Student,Full time,Somewhat helpful,Somewhat effective,Somewhat helpful,Somewhat difficult,...,No,0.681216,1.0,2016,Spring,101,Aplia,Yes,69.0,22.148708
3,28231,Diablo Valley College,Male,White,Local Student,Part time,Somewhat helpful,Somewhat effective,Somewhat helpful,Somewhat difficult,...,No,0.878339,1.0,2016,Spring,101,Aplia,Yes,72.0,33.902392
4,85016,Diablo Valley College,Male,Asian,Local Student,Full time,Not too helpful,Not too effective,Somewhat helpful,Somewhat difficult,...,No,0.501591,0.92,2016,Spring,102,Aplia,No,71.0,25.102162


In [35]:
# Create the Obese column
join_df['Obese'] = ""
for i in range(len(join_df["BMI"])):
    if join_df["BMI"][i] < 18.5:
        join_df['Obese'][i] = "Under Weight"
    elif join_df["BMI"][i] < 25.1:
        join_df["Obese"][i] = "Normal"
    elif join_df["BMI"][i] < 30.1:
        join_df["Obese"][i] = "Over Weight"
    else:
        join_df["Obese"][i] = "Obese"
        
join_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,id,School,Gender,Race,Local,Full Time,Future,Effectiveness,Online Supplement,Supplement Use,...,Final,Attend,Year,Semester,Class,Platform,Pass,Height,BMI,Obese
0,68968,Diablo Valley College,Male,Asian,Local Student,Full time,Very helpful,Very effective,Very helpful,Somewhat difficult,...,0.81252,0.791667,2016,Spring,101,Aplia,Yes,69.0,22.148708,Normal
1,46998,Diablo Valley College,Female,Asian,Local Student,Full time,Somewhat helpful,Somewhat effective,Somewhat helpful,Not too difficult,...,0.640884,1.0,2016,Spring,101,Aplia,No,64.0,27.460938,Over Weight
2,79024,Diablo Valley College,Female,African American,Local Student,Full time,Somewhat helpful,Somewhat effective,Somewhat helpful,Somewhat difficult,...,0.681216,1.0,2016,Spring,101,Aplia,Yes,69.0,22.148708,Normal
3,28231,Diablo Valley College,Male,White,Local Student,Part time,Somewhat helpful,Somewhat effective,Somewhat helpful,Somewhat difficult,...,0.878339,1.0,2016,Spring,101,Aplia,Yes,72.0,33.902392,Obese
4,85016,Diablo Valley College,Male,Asian,Local Student,Full time,Not too helpful,Not too effective,Somewhat helpful,Somewhat difficult,...,0.501591,0.92,2016,Spring,102,Aplia,No,71.0,25.102162,Over Weight


In [29]:
join_df.columns

Index(['id', 'School', 'Gender', 'Race', 'Local', 'Full Time', 'Future',
       'Effectiveness', 'Online Supplement', 'Supplement Use', 'Feet',
       'Inches', 'Weight', 'Exercise per Week', 'Exercise Time', 'Read Label',
       'Taste', 'Price', 'Nutrition', 'Presentation', 'Appearance', 'Health',
       'Parents Overweight', 'Final', 'Attend', 'Year', 'Semester', 'Class',
       'Platform', 'Pass', 'Height', 'BMI', 'Obese'],
      dtype='object')

In [55]:
# Create contingency table to show students' preference on food purchase
t = join_df["Taste"].value_counts(sort=True)
p = join_df["Price"].value_counts(sort=True)
pr = join_df["Presentation"].value_counts(sort=True)
n = join_df["Nutrition"].value_counts(sort=True)

In [57]:
print(t)
print(p)
print(n)
print(pr)

Int64Index([1, 2, 3, 4], dtype='int64')
2    398
3    327
1    319
4    183
Name: Price, dtype: int64
3    392
2    341
4    263
1    231
Name: Nutrition, dtype: int64
4    576
3    294
2    182
1    175
Name: Presentation, dtype: int64


In [64]:
pref = join_df[["Taste", "Price", "Presentation", "Nutrition"]]
pref.head()

Unnamed: 0,Taste,Price,Presentation,Nutrition
0,1,2,4,3
1,3,1,2,4
2,1,2,4,3
3,1,2,3,4
4,1,2,3,4


In [65]:
pref = pref.melt(var_name='Preferences', value_name='Ranks')
pref.head()

Unnamed: 0,Preferences,Ranks
0,Taste,1
1,Taste,3
2,Taste,1
3,Taste,1
4,Taste,1


In [63]:
cross_table = pd.crosstab(index=pref['Ranks'], columns=pref['Preferences'])
cross_table

Preferences,Nutrition,Presentation,Price,Taste
Ranks,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,231,175,319,502
2,341,182,398,306
3,392,294,327,214
4,263,576,183,205


In [66]:
join_df.corr()

Unnamed: 0,id,Feet,Inches,Weight,Taste,Price,Nutrition,Presentation,Final,Attend,Height,BMI
id,1.0,-0.008081,0.029631,-0.024919,0.012235,-0.009166,0.03105,-0.033202,0.055993,0.016369,0.00592,-0.012643
Feet,-0.008081,1.0,-0.310509,0.384391,0.027594,0.011431,-0.044471,0.002984,0.024339,-0.015611,0.888384,-0.156093
Inches,0.029631,-0.310509,1.0,0.063837,-0.026942,-0.037804,0.033275,0.031456,0.0037,-0.020808,0.160556,-0.010821
Weight,-0.024919,0.384391,0.063837,1.0,-0.004844,-0.017147,0.024969,-0.002573,-0.011215,-0.090167,0.429964,0.735993
Taste,0.012235,0.027594,-0.026942,-0.004844,1.0,-0.201162,-0.399576,-0.459385,0.040165,6.5e-05,0.01564,-0.019504
Price,-0.009166,0.011431,-0.037804,-0.017147,-0.201162,1.0,-0.369118,-0.378397,0.016281,0.060849,-0.006389,-0.027599
Nutrition,0.03105,-0.044471,0.033275,0.024969,-0.399576,-0.369118,1.0,-0.188665,-0.054629,-0.011801,-0.030106,0.02537
Presentation,-0.033202,0.002984,0.031456,-0.002573,-0.459385,-0.378397,-0.188665,1.0,-0.004832,-0.045646,0.018291,0.021826
Final,0.055993,0.024339,0.0037,-0.011215,0.040165,0.016281,-0.054629,-0.004832,1.0,0.235558,0.027059,-0.02641
Attend,0.016369,-0.015611,-0.020808,-0.090167,6.5e-05,0.060849,-0.011801,-0.045646,0.235558,1.0,-0.026259,-0.075957
