<a href="https://colab.research.google.com/github/Raghavendarlokineni/colab-machine-learning/blob/develop/refactoring_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



```
# This is an exercise from the Udacity program
```



In [37]:
import pandas as pd
# UCI red-wine quality data; the file is semicolon-delimited.
WINE_QUALITY_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
df = pd.read_csv(WINE_QUALITY_URL, sep=";")

# Preview the first five rows.
print(df.head())

   fixed acidity  volatile acidity  citric acid  ...  sulphates  alcohol  quality
0            7.4              0.70         0.00  ...       0.56      9.4        5
1            7.8              0.88         0.00  ...       0.68      9.8        5
2            7.8              0.76         0.04  ...       0.65      9.8        5
3           11.2              0.28         0.56  ...       0.58      9.8        6
4            7.4              0.70         0.00  ...       0.56      9.4        5

[5 rows x 12 columns]


**Renaming columns manually**

In [38]:
# Hand-written old -> new mapping, one entry per column whose name contains a space.
manual_renames = {
    'fixed acidity': 'fixed_acidity',
    'volatile acidity': 'volatile_acidity',
    'citric acid': 'citric_acid',
    'residual sugar': 'residual_sugar',
    'free sulfur dioxide': 'free_sulfur_dioxide',
    'total sulfur dioxide': 'total_sulfur_dioxide',
}

# rename() returns a new frame, so df keeps its original labels here.
new_df = df.rename(columns=manual_renames)

print(new_df.columns)

Index(['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
       'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')


Another way of doing this: replace the spaces in each label, one label at a time.

In [39]:
# Repetitive version: the labels at positions 0-6 are rewritten one statement
# at a time, replacing every space with an underscore. Positions 7 and up are
# not touched by this cell.
labels = list(df.columns)
labels[0] = labels[0].replace(" ", "_")
labels[1] = labels[1].replace(" ", "_")
labels[2] = labels[2].replace(" ", "_")
labels[3] = labels[3].replace(" ", "_")
labels[4] = labels[4].replace(" ", "_")
labels[5] = labels[5].replace(" ", "_")
labels[6] = labels[6].replace(" ", "_")
df.columns = labels  # assigns the edited labels back, renaming df's columns in place

print(df.columns)


Index(['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
       'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')


**Better way of renaming columns of this dataset**

In [0]:
# One comprehension normalises every label, however many columns the frame
# has; it is equivalent to looping over range(len(labels)) and replacing the
# spaces in labels[i] one index at a time.
df.columns = [label.replace(" ", "_") for label in df.columns]

**Analysing features**

In [41]:
# Disabled draft held in a bare string literal, so none of it executes. If run,
# it would overwrite df['alcohol'] with "high"/"low" labels (value >= column
# median -> "high") and then average `quality` per label. Being the cell's only
# expression, the string itself is what the cell echoes as its output.
'''
median_alcohol = df.alcohol.median()
for i, alcohol in enumerate(df.alcohol):
    if alcohol >= median_alcohol:
        df.loc[i, 'alcohol'] = "high"
    else:
        df.loc[i, 'alcohol'] = "low"    

df.groupby('alcohol').quality.mean()
'''

'\nmedian_alcohol = df.alcohol.median()\nfor i, alcohol in enumerate(df.alcohol):\n    if alcohol >= median_alcohol:\n        df.loc[i, \'alcohol\'] = "high"\n    else:\n        df.loc[i, \'alcohol\'] = "low"    \n\ndf.groupby(\'alcohol\').quality.mean()\n'

In [42]:
# Disabled draft held in a bare string literal, so none of it executes. If run,
# it would overwrite df['pH'] with 'high'/'low' labels (value >= column median
# -> 'high') and then average `quality` per label. Being the cell's only
# expression, the string itself is what the cell echoes as its output.
'''
median_pH = df.pH.median()
for i, pH in enumerate(df.pH):
    if pH >= median_pH:
        df.loc[i, 'pH'] = 'high'
    else:
        df.loc[i, 'pH'] = 'low'
df.groupby('pH').quality.mean()
'''

"\nmedian_pH = df.pH.median()\nfor i, pH in enumerate(df.pH):\n    if pH >= median_pH:\n        df.loc[i, 'pH'] = 'high'\n    else:\n        df.loc[i, 'pH'] = 'low'\ndf.groupby('pH').quality.mean()\n"

**Computing the mean `quality` for each feature this way is getting repetitive. So now let's make it modular: a function that does the high/low split for whichever feature is passed as an argument, so the mean can be computed for every feature in a loop.**

In [43]:
def calculate_mean(df, column_name):
    """Replace a numeric column with "high"/"low" labels split at its median.

    Despite the name, no mean is computed here: the function only bins
    ``column_name`` so the caller can group by it afterwards, e.g.
    ``df.groupby(column_name).quality.mean()``.

    Args:
        df: DataFrame to modify **in place**.
        column_name: Name of a numeric column of ``df``.

    Returns:
        None. ``df[column_name]`` is overwritten: values greater than or
        equal to the column median become "high"; all others (including
        missing values, which never compare >= the median) become "low".
    """
    median = df[column_name].median()
    # A single vectorised comparison, aligned on the frame's own index, so the
    # labels land on the right rows for any index (not only the default
    # 0..n-1 RangeIndex) and the column is replaced in one assignment.
    is_high = df[column_name] >= median
    df[column_name] = is_high.map({True: "high", False: "low"})


# Bin a copy so the numeric `df` stays untouched and this cell can be re-run:
# calculate_mean overwrites each column with "high"/"low" strings, and calling
# median() on those strings in a second run would fail.
binned_df = df.copy()
for feature in binned_df.columns[:-1]:  # every column except the last one
    calculate_mean(binned_df, feature)
    print(binned_df.groupby(feature).quality.mean(), "\n")

fixed_acidity
high    5.726061
low     5.540052
Name: quality, dtype: float64 

volatile_acidity
high    5.392157
low     5.890166
Name: quality, dtype: float64 

citric_acid
high    5.822360
low     5.447103
Name: quality, dtype: float64 

residual_sugar
high    5.665880
low     5.602394
Name: quality, dtype: float64 

chlorides
high    5.507194
low     5.776471
Name: quality, dtype: float64 

free_sulfur_dioxide
high    5.595268
low     5.677136
Name: quality, dtype: float64 

total_sulfur_dioxide
high    5.522981
low     5.750630
Name: quality, dtype: float64 

density
high    5.540574
low     5.731830
Name: quality, dtype: float64 

pH
high    5.598039
low     5.675607
Name: quality, dtype: float64 

sulphates
high    5.898917
low     5.351562
Name: quality, dtype: float64 

alcohol
high    5.958904
low     5.310302
Name: quality, dtype: float64 

