# PRE-MODELING DATA PREP


## Reshaping Data

In [3]:
# create a sample dataframe
import pandas as pd

songs_dict = {
                'Customer': ['Aria', 'Aria', 'Aria', 'Chord', 'Chord', 'Harmony', 'Harmony', 'Harmony', 'Melody', 'Reed'],
                'Genre': ['Pop', 'Indie', 'Rock', 'Pop', 'Indie', 'Pop', 'Indie', 'Rock', 'Rock', 'Rock'],
                '# Songs': [50, 48, 1, 15, 36, 10, 5, 3, 2, 5]
             }
songs_dict

{'Customer': ['Aria',
  'Aria',
  'Aria',
  'Chord',
  'Chord',
  'Harmony',
  'Harmony',
  'Harmony',
  'Melody',
  'Reed'],
 'Genre': ['Pop',
  'Indie',
  'Rock',
  'Pop',
  'Indie',
  'Pop',
  'Indie',
  'Rock',
  'Rock',
  'Rock'],
 '# Songs': [50, 48, 1, 15, 36, 10, 5, 3, 2, 5]}

In [7]:
df = pd.DataFrame(songs_dict) #transforms the above dictionary to table/dataframe
df

Unnamed: 0,Customer,Genre,# Songs
0,Aria,Pop,50
1,Aria,Indie,48
2,Aria,Rock,1
3,Chord,Pop,15
4,Chord,Indie,36
5,Harmony,Pop,10
6,Harmony,Indie,5
7,Harmony,Rock,3
8,Melody,Rock,2
9,Reed,Rock,5


In [25]:
# group by
df.groupby('Customer')['# Songs'].sum().reset_index()
#the .reset_index() creates an index column 

Unnamed: 0,Customer,# Songs
0,Aria,99
1,Chord,51
2,Harmony,18
3,Melody,2
4,Reed,5


In [35]:
#To make it more readable
customers_songs = (df.groupby('Customer')['# Songs']
                     .sum()
                     .reset_index())

In [37]:
customers_songs 

Unnamed: 0,Customer,# Songs
0,Aria,99
1,Chord,51
2,Harmony,18
3,Melody,2
4,Reed,5


In [47]:
#pivot 
(df.pivot(index='Customer',
          columns = 'Genre', 
          values = '# Songs'))

Genre,Indie,Pop,Rock
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aria,48.0,50.0,1.0
Chord,36.0,15.0,
Harmony,5.0,10.0,3.0
Melody,,,2.0
Reed,,,5.0


In [53]:
(df.pivot(index='Customer',
          columns = 'Genre', 
          values = '# Songs')
.fillna(0))

Genre,Indie,Pop,Rock
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aria,48.0,50.0,1.0
Chord,36.0,15.0,0.0
Harmony,5.0,10.0,3.0
Melody,0.0,0.0,2.0
Reed,0.0,0.0,5.0


In [55]:
(df.pivot(index='Customer',
          columns = 'Genre', 
          values = '# Songs')
.fillna(0)
.reset_index())

Genre,Customer,Indie,Pop,Rock
0,Aria,48.0,50.0,1.0
1,Chord,36.0,15.0,0.0
2,Harmony,5.0,10.0,3.0
3,Melody,0.0,0.0,2.0
4,Reed,0.0,0.0,5.0


In [59]:
customers_genre = (df.pivot(index='Customer',
          columns = 'Genre', 
          values = '# Songs')
.fillna(0)
.reset_index())
customers_genre

Genre,Customer,Indie,Pop,Rock
0,Aria,48.0,50.0,1.0
1,Chord,36.0,15.0,0.0
2,Harmony,5.0,10.0,3.0
3,Melody,0.0,0.0,2.0
4,Reed,0.0,0.0,5.0


# Prepping Cols for Modeling

In [91]:
customers_raw = pd.read_csv(r"C:\Users\user\Desktop\ML Projects\ML Dataset\Data\customers.csv")
customers_raw.head(3)


Unnamed: 0,Name,Age,Followers,Income,Sign Up Date,Discount,Education Level
0,Aria,25.0,0.0,"$45,000",5/18/23,Yes,College
1,Chord,19.0,12.0,"$28,000",8/23/23,Yes,High School
2,Harmony,26.0,,"$120,000",4/25/23,No,Graduate School


In [83]:
customers_raw.isna()

Unnamed: 0,Name,Age,Followers,Income,Sign Up Date,Discount,Education Level
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,True,False,False,False,False
3,False,False,True,False,False,False,False
4,False,False,False,False,False,False,False
5,False,True,False,False,False,False,False
6,False,True,True,False,False,False,False
7,False,False,False,False,False,False,False


In [95]:
customers_raw.isna().any(axis = 1)

0    False
1    False
2     True
3     True
4    False
5     True
6     True
7    False
dtype: bool

In [97]:
customers = customers_raw
customers 

Unnamed: 0,Name,Age,Followers,Income,Sign Up Date,Discount,Education Level
0,Aria,25.0,0.0,"$45,000",5/18/23,Yes,College
1,Chord,19.0,12.0,"$28,000",8/23/23,Yes,High School
2,Harmony,26.0,,"$120,000",4/25/23,No,Graduate School
3,Melody,47.0,,"$450,000",5/5/23,No,College
4,Reed,52.0,0.0,"$75,000",6/14/23,Yes,High School
5,Selena,,1.0,"$62,000",8/26/23,No,College
6,Stefani,,,"$81,000",9/24/23,No,College
7,Taylor,33.0,52.0,"$60,000",9/8/23,No,High School


In [103]:
#To show the actual rows with the missing values
customers[customers.isna().any(axis=1)]

Unnamed: 0,Name,Age,Followers,Income,Sign Up Date,Discount,Education Level
2,Harmony,26.0,,"$120,000",4/25/23,No,Graduate School
3,Melody,47.0,,"$450,000",5/5/23,No,College
5,Selena,,1.0,"$62,000",8/26/23,No,College
6,Stefani,,,"$81,000",9/24/23,No,College


## Handling Missing Data

###### 1. We can drop the rows with missing values if they are very few since it won't be noticed
###### 2. We can fill in the rows with the median age. We use median to avoid skewness and rep the entire data points
###### 3. We can fill in using intuition and domain expertise

In [120]:
#1. Dropping the rows
customers.dropna().reset_index(drop = True )

Unnamed: 0,Name,Age,Followers,Income,Sign Up Date,Discount,Education Level
0,Aria,25.0,0.0,"$45,000",5/18/23,Yes,College
1,Chord,19.0,12.0,"$28,000",8/23/23,Yes,High School
2,Reed,52.0,0.0,"$75,000",6/14/23,Yes,High School
3,Taylor,33.0,52.0,"$60,000",9/8/23,No,High School


In [132]:
col_dropped = customers.dropna().any(axis = 1).reset_index(drop = True )
col_dropped

0    True
1    True
2    True
3    True
dtype: bool

In [124]:
customers_dropped = customers.dropna().reset_index(drop = True)
customers_dropped

Unnamed: 0,Name,Age,Followers,Income,Sign Up Date,Discount,Education Level
0,Aria,25.0,0.0,"$45,000",5/18/23,Yes,College
1,Chord,19.0,12.0,"$28,000",8/23/23,Yes,High School
2,Reed,52.0,0.0,"$75,000",6/14/23,Yes,High School
3,Taylor,33.0,52.0,"$60,000",9/8/23,No,High School


In [134]:
col_dropped = customers.dropna().any(axis = 1).reset_index(drop = True )
col_dropped

0    True
1    True
2    True
3    True
dtype: bool

In [142]:
# Fill missing values with the median Age of the data
customer_median_age = customers.Age.median()

In [152]:
round(customers.Age.fillna(customer_median_age))

0    25.0
1    19.0
2    26.0
3    47.0
4    52.0
5    30.0
6    30.0
7    33.0
Name: Age, dtype: float64

In [156]:
round(customers.Followers.fillna(customer_median_age))

0     0.0
1    12.0
2    30.0
3    30.0
4     0.0
5     1.0
6    30.0
7    52.0
Name: Followers, dtype: float64

In [403]:
customers.Age = round(customers.Age.fillna(customer_median_age))
customers.Followers = round(customers.Followers.fillna(customer_median_age))

In [160]:
customers

Unnamed: 0,Name,Age,Followers,Income,Sign Up Date,Discount,Education Level
0,Aria,25.0,0.0,"$45,000",5/18/23,Yes,College
1,Chord,19.0,12.0,"$28,000",8/23/23,Yes,High School
2,Harmony,26.0,30.0,"$120,000",4/25/23,No,Graduate School
3,Melody,47.0,30.0,"$450,000",5/5/23,No,College
4,Reed,52.0,0.0,"$75,000",6/14/23,Yes,High School
5,Selena,30.0,1.0,"$62,000",8/26/23,No,College
6,Stefani,30.0,30.0,"$81,000",9/24/23,No,College
7,Taylor,33.0,52.0,"$60,000",9/8/23,No,High School


In [None]:
#Converting to NUmeric 

In [162]:
customers.dtypes

Name                object
Age                float64
Followers          float64
Income              object
Sign Up Date        object
Discount            object
Education Level     object
dtype: object

In [168]:
customers.Income

0     $45,000 
1     $28,000 
2    $120,000 
3    $450,000 
4     $75,000 
5     $62,000 
6     $81,000 
7     $60,000 
Name: Income, dtype: object

In [172]:
#Replace the dollar sign and commas
customers.Income = customers.Income.str.replace('$','').str.replace(',','')
customers.Income

0     45000 
1     28000 
2    120000 
3    450000 
4     75000 
5     62000 
6     81000 
7     60000 
Name: Income, dtype: object

In [176]:
customers.Income = pd.to_numeric(customers.Income)
customers.Income

0     45000
1     28000
2    120000
3    450000
4     75000
5     62000
6     81000
7     60000
Name: Income, dtype: int64

In [409]:
customers

Unnamed: 0,Name,Age,Followers,Income,Discount,Sign Up Month,Sign Up Day,Sign Up Weekday,College,Graduate School,High School
0,Aria,25,0,45000,1,5,3,3,1,0,0
1,Chord,19,12,28000,1,8,2,2,0,0,1
2,Harmony,26,30,120000,0,4,1,1,0,1,0
3,Melody,47,30,450000,0,5,4,4,1,0,0
4,Reed,52,0,75000,1,6,2,2,0,0,1
5,Selena,30,1,62000,0,8,5,5,1,0,0
6,Stefani,30,30,81000,0,9,6,6,1,0,0
7,Taylor,33,52,60000,0,9,4,4,0,0,1


In [180]:
customers.dtypes

Name                object
Age                float64
Followers          float64
Income               int64
Sign Up Date        object
Discount            object
Education Level     object
dtype: object

In [411]:
customers.Age = customers.Age.astype(int)
customers.Followers = customers.Followers.astype(int)


In [190]:
customers.dtypes

Name               object
Age                 int32
Followers           int32
Income              int64
Sign Up Date       object
Discount           object
Education Level    object
dtype: object

In [413]:
#Converting to Datetime: date time often read as text by pandas can be converted to date with the pd.to_datetime()
customers['Sign Up Date'].head()

KeyError: 'Sign Up Date'

In [415]:
customers['Sign Up Date'] = pd.to_datetime(customers['Sign Up Date'], format == '%m/%d/%y')
customers['Sign Up Date']

KeyError: 'Sign Up Date'

In [208]:
customers.dtypes

Name                       object
Age                         int32
Followers                   int32
Income                      int64
Sign Up Date       datetime64[ns]
Discount                   object
Education Level            object
dtype: object

In [None]:
#Calculating as Condition: Extracting features from datetime

In [401]:
customers['Sign Up Month'] = customers['Sign Up Date'].dt.month

KeyError: 'Sign Up Date'

In [214]:
customers['Sign Up Month'].dtype

dtype('int32')

In [224]:
customers['Sign Up Weekday'] = customers['Sign Up Date'].dt.dayofweek

In [226]:
customers['Sign Up Weekday']

0    3
1    2
2    1
3    4
4    2
5    5
6    6
7    4
Name: Sign Up Weekday, dtype: int32

In [222]:
customers.dtypes


Name                       object
Age                         int32
Followers                   int32
Income                      int64
Sign Up Date       datetime64[ns]
Discount                   object
Education Level            object
Sign Up Month               int32
Sign Up Day                 int32
dtype: object

In [234]:
customers = customers.drop(columns =('Sign Up Date'))


KeyError: "['Sign Up Date'] not found in axis"

In [236]:
customers 

Unnamed: 0,Name,Age,Followers,Income,Discount,Education Level,Sign Up Month,Sign Up Day,Sign Up Weekday
0,Aria,25,0,45000,Yes,College,5,3,3
1,Chord,19,12,28000,Yes,High School,8,2,2
2,Harmony,26,30,120000,No,Graduate School,4,1,1
3,Melody,47,30,450000,No,College,5,4,4
4,Reed,52,0,75000,Yes,High School,6,2,2
5,Selena,30,1,62000,No,College,8,5,5
6,Stefani,30,30,81000,No,College,9,6,6
7,Taylor,33,52,60000,No,High School,9,4,4


## Conditional Logic

In [241]:
import numpy as np

In [243]:
np.where(customers.Discount == 'Yes', 1, 0)

array([1, 1, 0, 0, 1, 0, 0, 0])

In [247]:
customers['Discount'] = np.where(customers.Discount == 'Yes', 1, 0)

In [249]:
customers

Unnamed: 0,Name,Age,Followers,Income,Discount,Education Level,Sign Up Month,Sign Up Day,Sign Up Weekday
0,Aria,25,0,45000,1,College,5,3,3
1,Chord,19,12,28000,1,High School,8,2,2
2,Harmony,26,30,120000,0,Graduate School,4,1,1
3,Melody,47,30,450000,0,College,5,4,4
4,Reed,52,0,75000,1,High School,6,2,2
5,Selena,30,1,62000,0,College,8,5,5
6,Stefani,30,30,81000,0,College,9,6,6
7,Taylor,33,52,60000,0,High School,9,4,4


## Dummy Variables

In [254]:
pd.get_dummies(customers['Education Level'])

Unnamed: 0,College,Graduate School,High School
0,True,False,False
1,False,False,True
2,False,True,False
3,True,False,False
4,False,False,True
5,True,False,False
6,True,False,False
7,False,False,True


In [256]:
pd.get_dummies(customers['Education Level']).astype(int)

Unnamed: 0,College,Graduate School,High School
0,1,0,0
1,0,0,1
2,0,1,0
3,1,0,0
4,0,0,1
5,1,0,0
6,1,0,0
7,0,0,1


In [258]:
dummies_edu = pd.get_dummies(customers['Education Level']).astype(int) 

In [260]:
customers

Unnamed: 0,Name,Age,Followers,Income,Discount,Education Level,Sign Up Month,Sign Up Day,Sign Up Weekday
0,Aria,25,0,45000,1,College,5,3,3
1,Chord,19,12,28000,1,High School,8,2,2
2,Harmony,26,30,120000,0,Graduate School,4,1,1
3,Melody,47,30,450000,0,College,5,4,4
4,Reed,52,0,75000,1,High School,6,2,2
5,Selena,30,1,62000,0,College,8,5,5
6,Stefani,30,30,81000,0,College,9,6,6
7,Taylor,33,52,60000,0,High School,9,4,4


In [266]:
#combine the customers_table and the dummies_edu table
customers = pd.concat((customers, dummies_edu), axis = 1)

In [270]:
customers.drop(columns = 'Education Level')

Unnamed: 0,Name,Age,Followers,Income,Discount,Sign Up Month,Sign Up Day,Sign Up Weekday,College,Graduate School,High School
0,Aria,25,0,45000,1,5,3,3,1,0,0
1,Chord,19,12,28000,1,8,2,2,0,0,1
2,Harmony,26,30,120000,0,4,1,1,0,1,0
3,Melody,47,30,450000,0,5,4,4,1,0,0
4,Reed,52,0,75000,1,6,2,2,0,0,1
5,Selena,30,1,62000,0,8,5,5,1,0,0
6,Stefani,30,30,81000,0,9,6,6,1,0,0
7,Taylor,33,52,60000,0,9,4,4,0,0,1


In [274]:
customers = customers.drop(columns = 'Education Level')

In [276]:
customers

Unnamed: 0,Name,Age,Followers,Income,Discount,Sign Up Month,Sign Up Day,Sign Up Weekday,College,Graduate School,High School
0,Aria,25,0,45000,1,5,3,3,1,0,0
1,Chord,19,12,28000,1,8,2,2,0,0,1
2,Harmony,26,30,120000,0,4,1,1,0,1,0
3,Melody,47,30,450000,0,5,4,4,1,0,0
4,Reed,52,0,75000,1,6,2,2,0,0,1
5,Selena,30,1,62000,0,8,5,5,1,0,0
6,Stefani,30,30,81000,0,9,6,6,1,0,0
7,Taylor,33,52,60000,0,9,4,4,0,0,1


###### Observe we have numeric values in all the colmuns above. Good Job!

## Feature Engineering

In [280]:
# the data from setting the row granularity, with a few more customers
songs_genres_dict = {'Customer': ['Aria', 'Chord', 'Harmony', 'Melody', 'Reed', 'Selena', 'Stefani', 'Taylor'],
                     '# Songs': [99, 51, 18, 2, 5, 60, 15, 121],
                     'Indie': [48, 36, 5, 0, 0, 20, 2, 19],
                     'Pop': [50, 15, 10, 0, 0, 20, 5, 89],
                     'Rock': [1, 0, 3, 2, 5, 20, 8, 13]}

songs_genres = pd.DataFrame(songs_genres_dict)
songs_genres

Unnamed: 0,Customer,# Songs,Indie,Pop,Rock
0,Aria,99,48,50,1
1,Chord,51,36,15,0
2,Harmony,18,5,10,3
3,Melody,2,0,0,2
4,Reed,5,0,0,5
5,Selena,60,20,20,20
6,Stefani,15,2,5,8
7,Taylor,121,19,89,13


In [313]:
model_df = pd.concat([customers,songs_genres], axis = 1).drop(columns=['Customer'])
model_df= model_df.drop(columns='Sign Up Day')
model_df

Unnamed: 0,Name,Age,Followers,Income,Discount,Sign Up Month,Sign Up Weekday,College,Graduate School,High School,# Songs,Indie,Pop,Rock
0,Aria,25,0,45000,1,5,3,1,0,0,99,48,50,1
1,Chord,19,12,28000,1,8,2,0,0,1,51,36,15,0
2,Harmony,26,30,120000,0,4,1,0,1,0,18,5,10,3
3,Melody,47,30,450000,0,5,4,1,0,0,2,0,0,2
4,Reed,52,0,75000,1,6,2,0,0,1,5,0,0,5
5,Selena,30,1,62000,0,8,5,1,0,0,60,20,20,20
6,Stefani,30,30,81000,0,9,6,1,0,0,15,2,5,8
7,Taylor,33,52,60000,0,9,4,0,0,1,121,19,89,13


In [294]:
#check number of pop in each row
model_df['Pop']

0    50
1    15
2    10
3     0
4     0
5    20
6     5
7    89
Name: Pop, dtype: int64

In [296]:
model_df['# Songs']

0     99
1     51
2     18
3      2
4      5
5     60
6     15
7    121
Name: # Songs, dtype: int64

In [375]:
model_df['pct_pop'] = model_df['Pop']/model_df['# Songs']
model_df['pct_pop']
model_df

Unnamed: 0,Age,Followers,Income,Discount,College,Graduate School,High School,# Songs,Indie,Pop,Rock,Weekend,pct_pop
0,25,0,45000,1,1,0,0,99,48,50,1,0,0.505051
1,19,12,28000,1,0,0,1,51,36,15,0,0,0.294118
2,26,30,120000,0,0,1,0,18,5,10,3,0,0.555556
3,47,30,450000,0,1,0,0,2,0,0,2,0,0.0
4,52,0,75000,1,0,0,1,5,0,0,5,0,0.0
5,30,1,62000,0,1,0,0,60,20,20,20,1,0.333333
6,30,30,81000,0,1,0,0,15,2,5,8,1,0.333333
7,33,52,60000,0,0,0,1,121,19,89,13,0,0.735537


In [304]:
model_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             8 non-null      object 
 1   Age              8 non-null      int32  
 2   Followers        8 non-null      int32  
 3   Income           8 non-null      int64  
 4   Discount         8 non-null      int32  
 5   Sign Up Month    8 non-null      int32  
 6   Sign Up Day      8 non-null      int32  
 7   Sign Up Weekday  8 non-null      int32  
 8   College          8 non-null      int32  
 9   Graduate School  8 non-null      int32  
 10  High School      8 non-null      int32  
 11  # Songs          8 non-null      int64  
 12  Indie            8 non-null      int64  
 13  Pop              8 non-null      int64  
 14  Rock             8 non-null      int64  
 15  pct_pop          8 non-null      float64
dtypes: float64(1), int32(9), int64(5), object(1)
memory usage: 868.0+ 

## Binning Values

In [333]:
#Binning Values is used for cases like having numbers represent weekdays (mon-to Fri, and weekends(Sat and sun). in this case, 6 which is sat might be interpreted as higher/better than Mondays). thus we bin the values into two categories: 1 to rep weekends and 0 to rep weekdays
model_df['Weekend'] = np.where(customers['Sign Up Weekday'].isin([5,6]), 1, 0)


In [335]:

model_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Name             8 non-null      object
 1   Age              8 non-null      int32 
 2   Followers        8 non-null      int32 
 3   Income           8 non-null      int64 
 4   Discount         8 non-null      int32 
 5   Sign Up Month    8 non-null      int32 
 6   College          8 non-null      int32 
 7   Graduate School  8 non-null      int32 
 8   High School      8 non-null      int32 
 9   # Songs          8 non-null      int64 
 10  Indie            8 non-null      int64 
 11  Pop              8 non-null      int64 
 12  Rock             8 non-null      int64 
 13  Weekend          8 non-null      int32 
dtypes: int32(8), int64(5), object(1)
memory usage: 772.0+ bytes


In [377]:
model_df.head(10)

Unnamed: 0,Age,Followers,Income,Discount,College,Graduate School,High School,# Songs,Indie,Pop,Rock,Weekend,pct_pop
0,25,0,45000,1,1,0,0,99,48,50,1,0,0.505051
1,19,12,28000,1,0,0,1,51,36,15,0,0,0.294118
2,26,30,120000,0,0,1,0,18,5,10,3,0,0.555556
3,47,30,450000,0,1,0,0,2,0,0,2,0,0.0
4,52,0,75000,1,0,0,1,5,0,0,5,0,0.0
5,30,1,62000,0,1,0,0,60,20,20,20,1,0.333333
6,30,30,81000,0,1,0,0,15,2,5,8,1,0.333333
7,33,52,60000,0,0,0,1,121,19,89,13,0,0.735537


## Proxy Variables

In [379]:
# external data with the average temperature in F each month in Chicago
avg_temp_dict = {'Month': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                 'Avg_Temp': [32, 36, 45, 56, 66, 77, 82, 81, 74, 62, 50, 37]}

avg_temp = pd.DataFrame(avg_temp_dict)
avg_temp

Unnamed: 0,Month,Avg_Temp
0,1,32
1,2,36
2,3,45
3,4,56
4,5,66
5,6,77
6,7,82
7,8,81
8,9,74
9,10,62


In [381]:
# we merge the model_df and avg_temp tables. it is similar to Joins in SQL and Vlookup in excel. the concept here is we are joing based on the common columns both tables have
model_df = pd.merge(model_df, avg_temp, left_on='Sign Up Month', right_on='Month')


KeyError: 'Sign Up Month'

In [352]:
model_df = model_df.drop(columns = 'Sign Up Month')

In [385]:
df = pd.read_csv(r"C:\Users\user\Desktop\ML Projects\ML Dataset\Data\customers.csv")

In [399]:
df['Sign Up Date'] = pd.to_datetime(df['Sign Up Date'], format == '%m/%d/%y')

  df['Sign Up Date'] = pd.to_datetime(df['Sign Up Date'], format == '%m/%d/%y')


AssertionError: 

In [354]:
model_df

Unnamed: 0,Name,Age,Followers,Income,Discount,College,Graduate School,High School,# Songs,Indie,Pop,Rock,Weekend
0,Aria,25,0,45000,1,1,0,0,99,48,50,1,0
1,Chord,19,12,28000,1,0,0,1,51,36,15,0,0
2,Harmony,26,30,120000,0,0,1,0,18,5,10,3,0
3,Melody,47,30,450000,0,1,0,0,2,0,0,2,0
4,Reed,52,0,75000,1,0,0,1,5,0,0,5,0
5,Selena,30,1,62000,0,1,0,0,60,20,20,20,1
6,Stefani,30,30,81000,0,1,0,0,15,2,5,8,1
7,Taylor,33,52,60000,0,0,0,1,121,19,89,13,0


# Feature Selection

In [357]:
model_df

Unnamed: 0,Name,Age,Followers,Income,Discount,College,Graduate School,High School,# Songs,Indie,Pop,Rock,Weekend
0,Aria,25,0,45000,1,1,0,0,99,48,50,1,0
1,Chord,19,12,28000,1,0,0,1,51,36,15,0,0
2,Harmony,26,30,120000,0,0,1,0,18,5,10,3,0
3,Melody,47,30,450000,0,1,0,0,2,0,0,2,0
4,Reed,52,0,75000,1,0,0,1,5,0,0,5,0
5,Selena,30,1,62000,0,1,0,0,60,20,20,20,1
6,Stefani,30,30,81000,0,1,0,0,15,2,5,8,1
7,Taylor,33,52,60000,0,0,0,1,121,19,89,13,0


In [365]:
#To exclude the 'Name' column, since it is an identifier, we save it as a series
#first make a copy
names = model_df.Name
names
#then drop the main table
model_df = model_df.drop(columns = 'Name')


AttributeError: 'DataFrame' object has no attribute 'Name'

In [367]:
model_df

Unnamed: 0,Age,Followers,Income,Discount,College,Graduate School,High School,# Songs,Indie,Pop,Rock,Weekend
0,25,0,45000,1,1,0,0,99,48,50,1,0
1,19,12,28000,1,0,0,1,51,36,15,0,0
2,26,30,120000,0,0,1,0,18,5,10,3,0
3,47,30,450000,0,1,0,0,2,0,0,2,0
4,52,0,75000,1,0,0,1,5,0,0,5,0
5,30,1,62000,0,1,0,0,60,20,20,20,1
6,30,30,81000,0,1,0,0,15,2,5,8,1
7,33,52,60000,0,0,0,1,121,19,89,13,0


In [369]:
names

0       Aria
1      Chord
2    Harmony
3     Melody
4       Reed
5     Selena
6    Stefani
7     Taylor
Name: Name, dtype: object