# Gender Recognition by Voice Machine Learning SVM

### Used Libraries
1. NumPy (Numerical Python)
2. Pandas
3. Matplotlib
4. Seaborn
5. Scikit-learn
6. Missingno

In [1]:
import os

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# visualization tools
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling as pp
from plotly.offline import init_notebook_mode, iplot

# Initialise plotly's offline notebook mode.
init_notebook_mode(connected=True)

# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

### Content:
1. Pandas Profiling Report
2. Missingno - Missing Data
3. Seaborn - Heatmap
4. Separating Features and Labels
5. Converting String Value To int Type for Labels
6. Data Standardisation
7. Splitting Dataset into Training Set and Testing Set
8. Build SVM Model with Default Hyperparameter
9. Accuracy Score
10. Confusion Matrix
11. F1 Score

### Reading Data

In [2]:
# Load the voice dataset into a DataFrame.
# NOTE(review): the path is relative to the working directory, while the setup
# cell lists files under /kaggle/input — confirm where voice.csv actually lives.
df = pd.read_csv("voice.csv")

In [3]:
# Preview five random rows. The seed is fixed (same value as the train/test
# split) so the preview is identical on every Restart & Run All.
df.sample(5, random_state=1)

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
3056,0.205813,0.04265,0.213778,0.198156,0.228381,0.030224,2.679823,10.054135,0.849874,0.261366,...,0.205813,0.206274,0.049505,0.243902,0.451172,0.195312,0.864258,0.668945,0.405283,female
3049,0.196159,0.044349,0.202957,0.192696,0.216348,0.023652,3.466938,17.390557,0.851043,0.293635,...,0.196159,0.198551,0.084746,0.232558,0.442708,0.195312,0.820312,0.625,0.572368,female
1287,0.183706,0.064713,0.181856,0.131701,0.246804,0.115103,1.439254,4.996871,0.933255,0.516996,...,0.183706,0.130354,0.046921,0.27907,1.363511,0.023438,6.84375,6.820312,0.155708,male
1140,0.188422,0.064872,0.170708,0.13734,0.255337,0.117997,1.844067,6.104316,0.902385,0.443069,...,0.188422,0.144218,0.050209,0.277457,0.967797,0.023438,8.460938,8.4375,0.058406,male
2752,0.196909,0.04196,0.193667,0.174611,0.223222,0.048611,1.820306,6.072823,0.88723,0.329733,...,0.196909,0.17313,0.049793,0.275862,1.056152,0.023438,9.328125,9.304688,0.085408,female


In [4]:
# Print a summary of the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3168 entries, 0 to 3167
Data columns (total 21 columns):
meanfreq    3168 non-null float64
sd          3168 non-null float64
median      3168 non-null float64
Q25         3168 non-null float64
Q75         3168 non-null float64
IQR         3168 non-null float64
skew        3168 non-null float64
kurt        3168 non-null float64
sp.ent      3168 non-null float64
sfm         3168 non-null float64
mode        3168 non-null float64
centroid    3168 non-null float64
meanfun     3168 non-null float64
minfun      3168 non-null float64
maxfun      3168 non-null float64
meandom     3168 non-null float64
mindom      3168 non-null float64
maxdom      3168 non-null float64
dfrange     3168 non-null float64
modindx     3168 non-null float64
label       3168 non-null object
dtypes: float64(20), object(1)
memory usage: 519.8+ KB


# Pandas Profiling Report


In [None]:
# Build a pandas-profiling report for the whole DataFrame.
report = pp.ProfileReport(df)

# Save the report as an HTML file in the working directory.
report.to_file("report.html")

# Leave the report as the cell's last expression so the notebook displays it.
report

# Missingno - Missing Data

In [None]:
import missingno as msno
# Draw the missingno matrix for the DataFrame to visualise missing values.
msno.matrix(df)
plt.show()

# Seaborn - Heatmap
### Relationship between columns
* +1 --> Direct proportion (perfect positive correlation)
* 0 --> No linear relationship
* -1 --> Inverse proportion (perfect negative correlation)

In [None]:
# Correlate the numeric columns only. The frame still contains the string
# `label` column here; pandas >= 2.0 no longer drops non-numeric columns
# silently in DataFrame.corr() and raises when it meets "male"/"female".
# select_dtypes keeps the cell working on both old and new pandas versions.
feature_corr = df.select_dtypes(include="number").corr()

fig, ax = plt.subplots(figsize=(25, 15))
sns.heatmap(
    feature_corr,
    annot=True,
    linewidths=0.5,
    linecolor="red",
    fmt='.1f',
    ax=ax,
)
# A title lets the figure stand alone when the notebook is skimmed.
ax.set_title("Correlation matrix of numeric features")
plt.show()

# Separating Features and Labels

In [None]:
# Feature matrix: every column except the last one, which holds the label.
X = df.drop(columns=df.columns[-1])
X.head()

# Converting String Value To int Type for Labels
### Encode label category
* Male -> 1
* Female -> 0

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the string labels in the last column into integer codes.
# LabelEncoder numbers the classes in sorted order, which gives the
# female -> 0 / male -> 1 mapping described above.
encoder = LabelEncoder()
y = encoder.fit_transform(df.iloc[:, -1])
print(y)

# Data Standardisation
### Standardise each feature to zero mean and unit variance (values are not bounded to [-1, 1])

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column (zero mean, unit variance).
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split below, so test-set statistics leak into the scaling — consider fitting
# on the training split only.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Splitting Dataset into Training Set and Testing Set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split

# Build SVM Model with Default Hyperparameter

In [None]:
from sklearn.svm import SVC
from sklearn import metrics

# Train a support vector classifier with its default hyperparameters on the
# training split, then predict labels for the held-out test split.
svc = SVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)

# Accuracy Score

In [None]:
# Fraction of test samples whose predicted label matches the true label.
accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy Score:')
print(accuracy)

# Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes and columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# F1 Score

In [None]:
from sklearn.metrics import f1_score

# Keep the result under its own name. Assigning it back to `f1_score` would
# overwrite the imported function, so calling f1_score again later in the
# session would raise "TypeError: ... object is not callable".
f1 = f1_score(y_test, y_pred)
print("F1 Score:")
print(f1)

# Thank You

If you have any suggestions, advice, or feedback, I would greatly appreciate hearing them.
### Also there are other visualization kernels
* [FIFA 19 Player Data Analysis and Visualization EDA](https://www.kaggle.com/ismailsefa/f-fa-19-player-data-analysis-and-visualization-eda)
* [Crimes Data Analysis and Visualzation (EDA)](https://www.kaggle.com/ismailsefa/crimes-data-analysis-and-visualzation-eda)
* [Google Play Store Apps Data Analysis (EDA)](https://www.kaggle.com/ismailsefa/google-play-store-apps-data-analysis-eda)
* [World Happiness Data Analysis and Visualization](https://www.kaggle.com/ismailsefa/world-happiness-data-analysis-and-visualization)
* [Used Cars Data Analysis and Visualization (EDA)](https://www.kaggle.com/ismailsefa/used-cars-data-analysis-and-visualization-eda)
* [Gender Recognition by Voice Machine Learning SVM](https://www.kaggle.com/ismailsefa/gender-recognition-by-voice-machine-learning-svm)