#### imbalanced data
A data set is called imbalanced if it contains many more samples from one class than from the rest of the classes. A common example of such a dataset is credit card fraud detection, where fraudulent transactions are rare. Classifiers can have good accuracy on the majority class but very poor accuracy on the minority class(es) because the much larger majority class dominates training. <br>
[more about it](https://towardsdatascience.com/comparing-different-classification-machine-learning-models-for-an-imbalanced-dataset-fdae1af3677f).

In [1]:

import time
import multiprocessing 

def basic_func(x):
    """Classify a number as 'zero', 'even', or 'odd'.

    Returns the string 'zero' when x equals 0, 'even' when x is divisible
    by 2, and 'odd' otherwise.
    """
    # Zero is handled first so it is never reported as 'even'.
    if x == 0:
        return 'zero'
    return 'even' if x % 2 == 0 else 'odd'

def multiprocessing_func(x):
    """Square x, wait two seconds, then print how the square is classified.

    The classification ('zero', 'even' or 'odd') comes from basic_func.
    Nothing is returned; the result is only printed.
    """
    squared = x * x
    # Fixed 2-second pause standing in for real work.
    time.sleep(2)
    parity = basic_func(squared)
    print('{} squared results in a/an {} number'.format(x, parity))
    
if __name__ == '__main__':
    starttime = time.time()

    # Launch one worker process per input value 0..9. Every worker sleeps
    # for two seconds on its own, so the waits overlap instead of adding up.
    processes = []
    for i in range(10):
        p = multiprocessing.Process(target=multiprocessing_func, args=(i,))
        p.start()
        processes.append(p)

    # Wait for every worker to exit before reading the clock again.
    for process in processes:
        process.join()

    print('That took {} seconds'.format(time.time() - starttime))

0 squared results in a/an zero number
1 squared results in a/an odd number
2 squared results in a/an even number
3 squared results in a/an odd number
4 squared results in a/an even number
5 squared results in a/an odd number
6 squared results in a/an even number
7 squared results in a/an odd number
8 squared results in a/an even number
9 squared results in a/an odd number
That took 2.0328800678253174 seconds


In [2]:
import numpy as np
# Show which NumPy release is installed in this kernel.
np.__version__

'1.17.2'

In [3]:
# A Python list can mix element types (here an int and a str).
u = [7, 'you']
print(type(u))
# A NumPy array holds a single dtype, so the integer 7 is converted to the
# string '7' when the mixed list is turned into an array.
y = np.array(u)
print(y)

<class 'list'>
['7' 'you']


In [4]:
# np.random.seed(0)  # uncomment to get the same random numbers on every run

# Two 2-D arrays of random integers drawn from 0..9 (10 is excluded).
x1 = np.random.randint(0, 10, size=(2, 3))  # 2 rows x 3 columns
x2 = np.random.randint(0, 10, size=(3, 2))  # 3 rows x 2 columns
print(x1)
print(x2)

[[7 0 9]
 [8 7 2]]
[[4 2]
 [7 4]
 [9 3]]


In [5]:
np.random.normal(0 , 1, (2,3)) # mean, standard deviation (not variance), output shape

array([[-0.0990949 ,  2.14213294,  0.93325442],
       [-0.62457672, -0.65253535, -0.87484735]])

In [6]:
# Values from 0 up to (but not including) 20, in steps of 2.
np.arange(0, 20, 2)

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18])

In [7]:
# 6 evenly spaced values from 0 to 10; both endpoints are included.
np.linspace(0,10,6)

array([ 0.,  2.,  4.,  6.,  8., 10.])

In [8]:
# 6x6 identity matrix: ones on the main diagonal, zeros everywhere else.
np.eye(6)

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.]])

In [9]:
x1 = np.arange(10)
# Print the whole array, then its last element on a second line.
# A negative index counts from the end, so -1 selects the final entry.
print(x1, x1[-1], sep='\n')

[0 1 2 3 4 5 6 7 8 9]
9


In [10]:
grid = np.array([
    [1, 2, 3],
    [4, 5, 6],
])
# axis=0 (the default) joins along the first axis: the second copy is
# appended as extra rows underneath the first, giving a 4x3 result.
np.concatenate((grid, grid), axis=0)

array([[1, 2, 3],
       [4, 5, 6],
       [1, 2, 3],
       [4, 5, 6]])

In [11]:
np.concatenate([grid,grid],axis=1) # joins side by side; np.hstack does the same here, and np.vstack matches axis=0

array([[1, 2, 3, 1, 2, 3],
       [4, 5, 6, 4, 5, 6]])

In [12]:
# Read the array's dimensions, then rearrange its six values into a single
# column (6 rows x 1 column). `grid` itself keeps its original shape.
i = grid.shape
o = grid.reshape(6, 1)
print(o, i)

[[1]
 [2]
 [3]
 [4]
 [5]
 [6]] (2, 3)


In [26]:
import pandas as pd
# NOTE(review): accuracy_score is imported again in the random-forest cell.
from sklearn.metrics import accuracy_score
# Show which pandas release is installed in this kernel.
pd.__version__

'0.25.1'

In [14]:
# Small example frame. The None in 'Index' is stored as a missing value
# (NaN), which is why the column is displayed as floats.
data = pd.DataFrame(dict(
    Country=['Russia', 'Colombia', 'Chile', 'Equador', 'Nigeria'],
    Index=[11, None, 15, 15, 71],
))
data

Unnamed: 0,Country,Index
0,Russia,11.0
1,Colombia,
2,Chile,15.0
3,Equador,15.0
4,Nigeria,71.0


In [15]:
# Summary statistics for the numeric column; the missing value is left out
# of the count.
print(data.describe())
# Order rows by Index, breaking ties alphabetically by Country. The row with
# a missing Index is placed last.
data.sort_values(by=['Index', 'Country'], ascending=True)

           Index
count   4.000000
mean   28.000000
std    28.728615
min    11.000000
25%    14.000000
50%    15.000000
75%    29.000000
max    71.000000


Unnamed: 0,Country,Index
0,Russia,11.0
2,Chile,15.0
3,Equador,15.0
4,Nigeria,71.0
1,Colombia,


In [16]:
# Load the job-applicant train and test CSV files into DataFrames.
# NOTE(review): the paths point into the current user's home directory, and
# neither frame is used by any later cell visible here — confirm they are needed.
train  = pd.read_csv("~/PRML/Data/job_applicants_train.csv")
test  = pd.read_csv("~/PRML/Data/job_applicants_test.csv")

In [17]:
# Optional inspection of the training frame; uncomment one line to run it.
#train.info()
#train.head()
#train

In [18]:
# Random forest on the synthetic 2-D data set: imports and data loading.

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Directory holding the synthetic data files.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
DATA_DIR = "/home/subbu/PRML/Data"


def _load_features_and_labels(path):
    """Read a comma-separated file once and split it into features and labels.

    Parameters
    ----------
    path : str
        File whose first three columns are numeric.

    Returns
    -------
    (features, labels)
        features is a 2-D array built from columns 0 and 1; labels is a 1-D
        array built from column 2.
    """
    # ndmin=2 keeps the column slicing valid even for a single-row file.
    table = np.loadtxt(path, usecols=(0, 1, 2), delimiter=',', ndmin=2)
    # Copies give independent, contiguous arrays rather than views of `table`.
    return table[:, :2].copy(), table[:, 2].copy()


# Each file is parsed once instead of once per column group.
data, target = _load_features_and_labels(DATA_DIR + "/syn_data_2d.txt")
test_data, test_target = _load_features_and_labels(DATA_DIR + "/syn_test_2d.txt")
# np.genfromtxt can be used similarly and offers additional options.



In [19]:
# Echo the training features and labels; NumPy abbreviates long arrays with '...'.
data,target

(array([[-11.103  ,   6.5635 ],
        [-10.393  ,   1.9807 ],
        [-13.282  ,   5.7771 ],
        ...,
        [ -6.5125 ,   0.88217],
        [ -9.2253 ,   0.19239],
        [ -4.1114 ,  -2.9822 ]]), array([1., 1., 1., ..., 2., 2., 2.]))

In [20]:
# Forest of 100 trees, each limited to depth 100. random_state is fixed so
# repeated fits on the same data give the same model.
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=100,
    random_state=5,
)

In [21]:
# Train the forest on the training features and labels; the cell echoes the
# fitted estimator with its parameters.
clf.fit(data,target)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=100, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=5, verbose=0,
                       warm_start=False)

In [22]:
# Predict a label for every test point, then cross-tabulate true labels
# (rows) against predicted labels (columns) as a confusion table.
preds = clf.predict(test_data)
pd.crosstab(
    index=test_target,
    columns=preds,
    rownames=['Actual'],
    colnames=['Predicted'],
)

Predicted,1.0,2.0
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,499,1
2.0,7,493


In [27]:
# Fraction of test points whose predicted label equals the true label.
accuracy_score(y_true=test_target, y_pred=preds)

0.992

In [None]:
# Load the space-delimited file; header=None means the first row is read as
# data, not as column names. (A stray trailing token that made this line a
# syntax error has been removed.)
data = pd.read_csv('func/val.txt', delimiter=' ', header=None)