In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle

%matplotlib inline
# doubled the males data set
# df = pd.read_csv("./data-labled/songs-gender-lable-balanced.csv")
# df = df.drop(["Year"], axis=1)
# df = df[df['dominant_gender'] != "neutral"]
# df = df.dropna()
# Load the labelled lyrics data set; the path is relative to the notebook's
# working directory.
songs_csv_path = './data-labled/songs-femal-male-lable.csv'
df = pd.read_csv(songs_csv_path)
# Quick sanity check of the loaded columns.
print(df.head())

In [None]:
# Convert the categorical gender label into a numeric id and build lookup
# tables in both directions (label -> id and id -> label).
category_codes, _ = pd.factorize(df['dominant_gender'])
df['category_id'] = category_codes

# One row per distinct (label, id) pair, ordered by id.
label_id_pairs = df[['dominant_gender', 'category_id']].drop_duplicates()
category_id_df = label_id_pairs.sort_values('category_id')

category_to_id = {label: cat_id for label, cat_id in category_id_df.values}
id_to_category = {
    cat_id: label
    for cat_id, label in category_id_df[['category_id', 'dominant_gender']].values
}

print(df.head(3))

In [None]:
import seaborn as sns
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
# LogisticRegression lives in sklearn.linear_model (importing it from
# sklearn.svm raises ImportError).
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
# LinearSVC is the classifier the notebook actually instantiates.
from sklearn.svm import LinearSVC

# Build TF-IDF features from the lyrics. Only 3- and 4-word n-grams are kept
# (ngram_range), English stop words are removed and terms that occur in fewer
# than 5 documents are dropped (min_df).
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1',  lowercase=True, ngram_range=(3, 4), stop_words='english')
features = tfidf.fit_transform(df.Lyrics.values.astype('U')).toarray()
labels = df.category_id
# A bare expression in the middle of a cell displays nothing, so print it.
print(features.shape)

# Find the terms which are most correlated with each gender.
from sklearn.feature_selection import chi2
import numpy as np
N = 2  # number of top terms reported per n-gram size

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it. The vocabulary does not depend on
# the category, so look it up once outside the loop.
_get_feature_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
all_feature_names = np.asarray(_get_feature_names())

# Report exactly the n-gram sizes the vectorizer produces. Filtering for
# 1- and 2-word terms while ngram_range is (3, 4) always yields empty lists.
min_n, max_n = tfidf.ngram_range

for dominant_gender, category_id in sorted(category_to_id.items()):
    chi2_scores, _ = chi2(features, labels == category_id)
    # argsort is ascending, so the most correlated terms are at the end.
    ranked_names = all_feature_names[np.argsort(chi2_scores)]
    print("# '{}':".format(dominant_gender))
    for n in range(min_n, max_n + 1):
        ngrams = [v for v in ranked_names if len(v.split(' ')) == n]
        print("  . Most correlated {}-grams:\n. {}".format(n, '\n. '.join(ngrams[-N:])))
    print("\n")

In [None]:
# Train a linear SVM on a hold-out split and plot its confusion matrix.
model = LinearSVC()
# df.index is split alongside the data so test rows can be traced back to df.
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features, labels, df.index, test_size=0.21, random_state=0)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

from sklearn.metrics import confusion_matrix
# Pin the row/column order to the ids in category_id_df. Without `labels`,
# confusion_matrix infers the classes from y_test/y_pred, so a class missing
# from the split would shrink the matrix and shift the tick labels below.
conf_mat = confusion_matrix(y_test, y_pred, labels=category_id_df.category_id.values)

fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(conf_mat, annot=True, fmt='d', ax=ax,
            xticklabels=category_id_df.dominant_gender.values, yticklabels=category_id_df.dominant_gender.values)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

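# Example: classify a new text (transform() expects an iterable of documents).
# example_features = tfidf.transform(["this is an example"])
# model.predict(example_features)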

In [None]:
from IPython.display import display

# For every (actual, predicted) pair of different classes that was confused
# at least 10 times, print the count and show the affected test rows.
for pred_id in category_id_df.category_id:
  for true_id in category_id_df.category_id:
    if pred_id == true_id:
      continue  # the diagonal holds correct predictions
    n_confused = conf_mat[true_id, pred_id]
    if n_confused < 10:
      continue
    print("'{}' predicted as '{}' : {} examples.".format(id_to_category[true_id], id_to_category[pred_id], n_confused))
    # Select the test rows with this actual/predicted combination and map
    # them back to their labels in df.
    confused_mask = (y_test == true_id) & (y_pred == pred_id)
    confused_rows = df.loc[indices_test[confused_mask]]
    display(confused_rows[['dominant_gender', 'Lyrics']])
    print('')

In [None]:

def plot_coefficients(classifier, feature_names, top_features=20):
    """Bar-plot the most negative and most positive coefficients of a linear model.

    Parameters
    ----------
    classifier : fitted estimator exposing ``coef_``
        The coefficients are flattened with ``ravel()``.
        NOTE(review): this assumes a single row of coefficients (e.g. a binary
        linear classifier); with several rows the flattened indices would no
        longer map onto ``feature_names`` -- confirm against the caller.
    feature_names : sequence of str
        Name of each feature, in the same order as the coefficients.
    top_features : int, default 20
        How many coefficients to show from each end of the sorted range.

    Negative coefficients are drawn in red, the others in blue.
    """
    coef = classifier.coef_.ravel()
    order = np.argsort(coef)  # ascending: most negative first
    top_negative_coefficients = order[:top_features]
    top_positive_coefficients = order[-top_features:]
    top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])

    # Derive the bar positions from what was actually selected. A hard-coded
    # 2 * top_features raises a length mismatch when there are fewer features
    # than top_features.
    positions = np.arange(len(top_coefficients))

    # create plot
    plt.figure(figsize=(15, 5))
    colors = ['red' if c < 0 else 'blue' for c in coef[top_coefficients]]
    plt.bar(positions, coef[top_coefficients], color=colors)
    feature_names = np.array(feature_names)
    # Put each tick on its own bar. Ticks previously started at 1 while the
    # bars started at 0, so every label sat one bar to the right.
    plt.xticks(positions, feature_names[top_coefficients], rotation=60, ha='right')
    plt.show()

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older versions.
_feature_names_getter = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
plot_coefficients(model, _feature_names_getter())


In [None]:
from sklearn import metrics

# Pass the class ids and their names explicitly, both taken from
# category_id_df (sorted by id), so each report row is labelled with the
# right class. Relying on df['dominant_gender'].unique() alone breaks when
# the classes present in the test split differ in number or order.
print(metrics.classification_report(
    y_test,
    y_pred,
    labels=category_id_df.category_id.values,
    target_names=category_id_df.dominant_gender.tolist(),
))

In [None]:
# Persist the trained classifier and the fitted vectorizer to the working
# directory, one pickle file each.
for pickle_path, fitted_object in (("model.pkl", model), ("tfidf.pkl", tfidf)):
    with open(pickle_path, "wb") as fout:
        pickle.dump(fitted_object, fout)