In [1]:
import pandas as pd

# Load the raw responses table from the local ./data directory.
raw_df = pd.read_csv('./data/responses.csv')

In [2]:
# Positional slice: columns 2..18 of the raw table are the 17 music-genre ratings.
music_columns = [name for name in raw_df.columns[2:19]]
personal_columns = [
    'Gender',
    'Age',
    'Village - town',
    'Branded clothing',
    'Happiness in life',
]
# Number of trailing personal-attribute columns; used later to slice rows.
PERSONAL_SIZE = len(personal_columns)

# Keep the genre ratings first and the personal attributes last.
selected_columns = music_columns + personal_columns
music_df = raw_df.filter(items=selected_columns, axis=1)

# Missing values per selected column.
music_df.isnull().sum()

Dance                4
Folk                 5
Country              5
Classical music      7
Musical              2
Pop                  3
Rock                 6
Metal or Hardrock    3
Punk                 8
Hiphop, Rap          4
Reggae, Ska          7
Swing, Jazz          6
Rock n roll          7
Alternative          7
Latino               8
Techno, Trance       7
Opera                1
Gender               6
Age                  7
Village - town       4
Branded clothing     2
Happiness in life    4
dtype: int64

In [3]:
# Discard every respondent that has at least one missing answer.
music_df = music_df.dropna()

In [4]:
# Re-count missing values per column to confirm none remain after dropna.
music_df.isnull().sum()

Dance                0
Folk                 0
Country              0
Classical music      0
Musical              0
Pop                  0
Rock                 0
Metal or Hardrock    0
Punk                 0
Hiphop, Rap          0
Reggae, Ska          0
Swing, Jazz          0
Rock n roll          0
Alternative          0
Latino               0
Techno, Trance       0
Opera                0
Gender               0
Age                  0
Village - town       0
Branded clothing     0
Happiness in life    0
dtype: int64

In [5]:
# Inspect column dtypes to see which columns still hold non-numeric (object) values.
music_df.dtypes

Dance                float64
Folk                 float64
Country              float64
Classical music      float64
Musical              float64
Pop                  float64
Rock                 float64
Metal or Hardrock    float64
Punk                 float64
Hiphop, Rap          float64
Reggae, Ska          float64
Swing, Jazz          float64
Rock n roll          float64
Alternative          float64
Latino               float64
Techno, Trance       float64
Opera                float64
Gender                object
Age                  float64
Village - town        object
Branded clothing     float64
Happiness in life    float64
dtype: object

In [6]:
# Distribution of the two settlement categories.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
music_df['Village - town'].value_counts()

city       648
village    270
Name: Village - town, dtype: int64

In [7]:
# Distribution of the 'Branded clothing' answers.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
music_df['Branded clothing'].value_counts()

3.0    256
4.0    225
1.0    159
5.0    142
2.0    136
Name: Branded clothing, dtype: int64

In [8]:
# Distribution of the 'Happiness in life' answers.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
music_df['Happiness in life'].value_counts()

4.0    468
3.0    258
5.0    128
2.0     54
1.0     10
Name: Happiness in life, dtype: int64

In [9]:
# Encode the two text columns as 0/1 integers.
gender_codes = {"female": 0, "male": 1}
settlement_codes = {"village": 0, "city": 1}

music_df['Gender'] = music_df['Gender'].replace(gender_codes)
music_df['Village - town'] = music_df['Village - town'].replace(settlement_codes)

In [10]:
# After encoding, 1 means "city", so name the column accordingly.
music_df = music_df.rename(columns={'Village - town': 'City'})

In [11]:
# Check that Gender and City are now integer columns after the recoding.
music_df.dtypes

Dance                float64
Folk                 float64
Country              float64
Classical music      float64
Musical              float64
Pop                  float64
Rock                 float64
Metal or Hardrock    float64
Punk                 float64
Hiphop, Rap          float64
Reggae, Ska          float64
Swing, Jazz          float64
Rock n roll          float64
Alternative          float64
Latino               float64
Techno, Trance       float64
Opera                float64
Gender                 int64
Age                  float64
City                   int64
Branded clothing     float64
Happiness in life    float64
dtype: object

In [12]:
# Cast every column to integer in one vectorised call instead of assigning
# column-by-column while iterating over the frame.
music_df = music_df.astype(int)

In [13]:
# Confirm that every column now has an integer dtype.
music_df.dtypes

Dance                int32
Folk                 int32
Country              int32
Classical music      int32
Musical              int32
Pop                  int32
Rock                 int32
Metal or Hardrock    int32
Punk                 int32
Hiphop, Rap          int32
Reggae, Ska          int32
Swing, Jazz          int32
Rock n roll          int32
Alternative          int32
Latino               int32
Techno, Trance       int32
Opera                int32
Gender               int32
Age                  int32
City                 int32
Branded clothing     int32
Happiness in life    int32
dtype: object

In [14]:
# Convert the frame to a plain list of rows (one list of values per respondent).
data = music_df.values.tolist()

In [15]:
# Peek at one row: the genre ratings come first, followed by the
# PERSONAL_SIZE personal attributes (Gender, Age, City, Branded clothing,
# Happiness in life).
data[1]

[2, 1, 1, 1, 2, 3, 5, 4, 4, 1, 3, 1, 4, 4, 2, 1, 1, 0, 19, 1, 1, 4]

In [16]:
# Genre names are every column except the trailing personal attributes.
genre_columns = music_df.columns[:-PERSONAL_SIZE]
music_cat = [name for name in genre_columns]
print(music_cat)

['Dance', 'Folk', 'Country', 'Classical music', 'Musical', 'Pop', 'Rock', 'Metal or Hardrock', 'Punk', 'Hiphop, Rap', 'Reggae, Ska', 'Swing, Jazz', 'Rock n roll', 'Alternative', 'Latino', 'Techno, Trance', 'Opera']


In [17]:
from sklearn.model_selection import train_test_split

# Share of rows held out for evaluation, and the seed that makes the shuffle repeatable.
TEST_FRACTION = 0.33
SPLIT_SEED = 25

training_data, test_data = train_test_split(
    data,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [18]:
# Report the size of each partition (training first, then test).
for subset in (training_data, test_data):
    print(len(subset))

615
303


In [19]:
def split_list(a_list, index):
    """Cut ``a_list`` at ``index`` and return the two pieces as ``(head, tail)``.

    ``head`` holds everything before ``index`` and ``tail`` everything from
    ``index`` onwards; negative indices count from the end, as in normal
    slicing.
    """
    head = a_list[:index]
    tail = a_list[index:]
    return head, tail

In [20]:
def create_y(y, categories=None):
    """Return, for each row of genre ratings, the name of the top-rated genre.

    Parameters
    ----------
    y : iterable of sequences
        One sequence of ratings per respondent; position ``i`` in a row is
        the rating for ``categories[i]``.
    categories : sequence of str, optional
        Genre names aligned with the rating positions. When omitted, the
        notebook-level ``music_cat`` list is used.

    Returns
    -------
    list
        One genre name per input row. When several genres share the highest
        rating, the one that appears first in the row wins.

    Raises
    ------
    ValueError
        If a row contains no ratings.
    IndexError
        If a row has more ratings than there are category names.
    """
    labels = music_cat if categories is None else categories

    result_list = []
    for values in y:
        ratings = list(values)
        if not ratings:
            raise ValueError("cannot pick a top genre from an empty row of ratings")
        if len(ratings) > len(labels):
            raise IndexError("row has more ratings than there are category names")
        # max() returns the first maximal item, so ties go to the earliest genre.
        best = max(range(len(ratings)), key=ratings.__getitem__)
        result_list.append(labels[best])
    return result_list

In [21]:
# Split each training row once into (genre ratings, personal attributes).
train_pairs = [split_list(row, -PERSONAL_SIZE) for row in training_data]

# Features are the personal attributes; the label is the top-rated genre.
train_x = [personal for _, personal in train_pairs]
train_y = create_y([ratings for ratings, _ in train_pairs])

print(len(train_x))
print(len(train_y))
print(train_x[0])
print(train_y[0])

615
615
[0, 16, 1, 3, 3]
Country


In [22]:
# Sanity check: number of training labels.
print(len(train_y))

615


In [23]:
# Split each held-out row once into (genre ratings, personal attributes).
test_pairs = [split_list(row, -PERSONAL_SIZE) for row in test_data]

# Features are the personal attributes; the label is the top-rated genre.
test_x = [personal for _, personal in test_pairs]
test_y = create_y([ratings for ratings, _ in test_pairs])

print(len(test_x))
print(len(test_y))
print(test_x[0])
print(test_y[0])

303
303
[1, 26, 1, 4, 4]
Musical


In [24]:
from sklearn.svm import SVC

# Fit a linear-kernel support vector classifier that predicts a respondent's
# top-rated genre (train_y) from their personal attributes (train_x).
clf_svm = SVC(kernel='linear')
clf_svm.fit(train_x, train_y)

SVC(kernel='linear')

In [25]:
# Score the fitted classifier on the held-out test rows.
print(clf_svm.score(test_x, test_y))

0.2508250825082508
