In [1]:
##Imports##
__author__ = 'bdyetton'
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

#Load and prepare data
all_charts = pd.read_csv('BillboardLyricData.txt', sep='\t', encoding='utf-8')
all_charts = all_charts.dropna() ## Remove missing data

class_mapping = {label:idx for idx,label in enumerate(np.unique(all_charts.chart))}
y_raw = all_charts.chart.map(class_mapping)

vectorizer = CountVectorizer(max_df=0.95, min_df=2,max_features=1000,stop_words='english')                              
vectorizer.fit(all_charts.lyrics) #Creates the dictionary to convert lyrics to counts
word_vector = vectorizer.transform(all_charts.lyrics) #Does actual conversion
X_raw = word_vector.todense()

#Split of test set, and set aside (dont touch until after we have found the best hyperparameters)
X_train, X_test, y_train, y_test = train_test_split(X_raw, y_raw, test_size=0.3)

model_2 = SGDClassifier(loss='log', n_iter=100, penalty='none')
model_2.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=100, n_jobs=1,
       penalty='none', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [28]:
## Extract weights:
coef_weights = model_2.coef_

#We dont care if the presence or the absence of this word is predictive
absolute_coef_weights = np.absolute(coef_weights)

## Find index of higest weights (-ve because we want highest to lowest)
absolute_coef_idx_sorted = np.argsort(-absolute_coef_weights, axis=1)

## Slice off the array after the first 10 to give the index of the highest 10 weights
ten_highest_idxs = absolute_coef_idx_sorted[:, :10]

## Get the words we used as features. 
feature_words = np.array(vectorizer.get_feature_names())

## Create map from label_ints to label_strings
inv_class_mapping = {v: k for k, v in class_mapping.items()}

##create a for loop that will run through each row to pull the words that correspond with the argsort place in ten_most_words
for label_int in y_raw.unique():
    print('\nGenre is',inv_class_mapping[label_int])
    row = ten_highest_idxs[label_int]
    coefs = coef_weights[label_int,row]
    to_print = np.vstack([feature_words[row],np.round(coefs)])
    print('Ten most predicitive words are:\n', to_print.transpose())
    


Genre is hot-holiday-songs
Ten most predicitive words are:
 [['doo' '-140.0']
 ['ooh' '-110.0']
 ['snow' '102.0']
 ['christmas' '92.0']
 ['nah' '-85.0']
 ['really' '-70.0']
 ['santa' '69.0']
 ['feel' '-66.0']
 ['hey' '-59.0']
 ['love' '-58.0']]

Genre is christian-songs
Ten most predicitive words are:
 [['cruisin' '106.0']
 ['wicked' '-106.0']
 ['family' '95.0']
 ['sorry' '-82.0']
 ['shit' '-78.0']
 ['drank' '-77.0']
 ['grace' '76.0']
 ['human' '-76.0']
 ['friends' '73.0']
 ['lord' '72.0']]

Genre is country-songs
Ten most predicitive words are:
 [['fame' '-175.0']
 ['ayy' '-174.0']
 ['doo' '-157.0']
 ['nah' '-145.0']
 ['lovin' '-127.0']
 ['wow' '-96.0']
 ['la' '-85.0']
 ['nigga' '-83.0']
 ['live' '-82.0']
 ['ooh' '-81.0']]

Genre is rock-songs
Ten most predicitive words are:
 [['blow' '-170.0']
 ['fame' '148.0']
 ['bun' '-119.0']
 ['ayo' '112.0']
 ['mum' '-100.0']
 ['pipe' '-97.0']
 ['family' '-85.0']
 ['rebel' '84.0']
 ['singing' '-79.0']
 ['ey' '-75.0']]

Genre is pop-songs
Ten mos