In [1]:
import pandas as pd
import numpy as np

# Problem Statement:

You work in XYZ Company as a Python developer. The company officials want
you to build a Python program.

### Tasks To Be Performed:

1. Write a function that takes the start and end of a range and returns a pandas Series object containing the numbers within that range.  
In case the user does not pass start or end or both they should default to 1 and 10 respectively.   
E.g:->  
range_series() -> Should Return a pandas series from 1 to 10  
range_series(5) -> Should Return a pandas series from 5 to 10  
range_series(5, 15) -> Should Return a pandas series from 5 to 15  
Create a method that takes n NumPy arrays of the same dimensions, sums them and returns the answer.

In [2]:
def range_series(start = 1, end = 10):
    '''Return a pandas Series of consecutive integers from `start` to `end`, inclusive.

    Parameters
    ----------
    start : any value accepted by int(), default 1
        First value of the series.
    end : any value accepted by int(), default 10
        Last value of the series.

    Returns
    -------
    pandas.Series
        Counts up by 1 when end >= start and down by 1 otherwise, so both
        endpoints are always included.

    Raises
    ------
    ValueError
        If `start` or `end` cannot be converted with int().
    '''
    try:
        start = int(start)
        end = int(end)
    except (TypeError, ValueError, OverflowError) as err:
        # Translate conversion failures only; a bare `except` would also turn
        # KeyboardInterrupt / SystemExit into a misleading ValueError.
        raise ValueError('Unable to interpret given values as integers') from err

    # Step towards `end`; adding the step to `end` makes the range inclusive.
    step = 1 if end >= start else -1
    return pd.Series(range(start, end + step, step))

In [3]:
class ShapeError(Exception):
    '''Raised by `sum_arrays` when the inputs do not all share one shape.'''


class LengthError(Exception):
    '''Raised by `sum_arrays` when it is called without any arguments.'''


def sum_arrays(*args):
    '''Return the element-wise sum of any number of equally shaped numeric arrays.

    Parameters
    ----------
    *args : array-like
        Each argument is converted with `np.array`; all must have the same shape.

    Returns
    -------
    numpy.ndarray
        Float array (the accumulator is created with `np.zeros`) with the
        common shape of the inputs.

    Raises
    ------
    LengthError
        If no arguments are passed.
    ShapeError
        If an argument's shape differs from the first argument's shape.
    TypeError
        If an argument cannot be added into the float accumulator
        (for example an array of strings).
    '''
    if not args:
        raise LengthError('No parameters were passed')

    arrays = [np.array(val) for val in args]
    array_shape = arrays[0].shape
    sum_arr = np.zeros(array_shape)

    for elem in arrays:
        if elem.shape != array_shape:
            raise ShapeError('Arrays of unequal shapes passed. Unable to proceed.')
        try:
            np.add(sum_arr, elem, out = sum_arr)
        except TypeError as err:
            # NumPy's ufunc type errors subclass TypeError; catching the public
            # base class avoids the private, deprecated `np.core._exceptions` path.
            raise TypeError('Unable to sum elements of different type') from err
    return sum_arr

In [4]:
range_series(5)

0     5
1     6
2     7
3     8
4     9
5    10
dtype: int64

In [5]:
# Booleans are added as 1/0, and the accumulator is a float array, hence array([2., 2.]).
sum_arrays([True, False], [1,2])

array([2., 2.])

2. Create a function that takes in two lists named keys and values as arguments  
Keys would be strings and contain n string values  
Values would be a list containing n lists  
The function should return a new pandas DataFrame with keys as column names and values as their corresponding values, e.g:
-> create_dataframe(["One", "Two"], [["X", "Y"], ["A", "B"]])   
-> should return a data frame  
One Two  
0 X A  
1 Y B  

In [6]:
def create_dataframe(keys, values):
    '''Build a DataFrame whose columns are named by `keys` and filled from `values`.

    Parameters
    ----------
    keys : iterable of str
        Column names, one per column.
    values : iterable of list
        Column contents; `values[i]` becomes the column named `keys[i]`.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If the number of keys differs from the number of value lists.
    '''
    keys = list(keys)
    values = list(values)
    # zip() stops at the shorter input, which would silently drop columns or data.
    if len(keys) != len(values):
        raise ValueError(
            f'Expected one list of values per key, got {len(keys)} keys '
            f'and {len(values)} value lists'
        )
    return pd.DataFrame(dict(zip(keys, values)))

In [7]:
create_dataframe(["One", "Two"], [["X", "Y"], ["A", "B"]])

Unnamed: 0,One,Two
0,X,A
1,Y,B


3. Create a function that concatenates two DataFrames. Use the previously created function to create two DataFrames and pass them as parameters.
Make sure that the indexes are reset before returning.

In [8]:
def df_concatenate(df1, df2, axis = 0, join = 'outer'):
    '''Concatenate two DataFrames and return the result with a fresh 0..n-1 row index.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to combine, in this order.
    axis : {0, 1, 'index', 'columns'}, default 0
        Axis to concatenate along, forwarded to `pd.concat`.
    join : {'outer', 'inner'}, default 'outer'
        How to handle labels on the other axis, forwarded to `pd.concat`.

    Returns
    -------
    pandas.DataFrame
        The combined frame; the row index is always reset and column names
        are always kept.
    '''
    combined = pd.concat([df1, df2], axis = axis, join = join)
    # `ignore_index=True` renumbers the labels along the concatenation axis,
    # which for axis=1 throws away the column names. Resetting the row index
    # explicitly gives the same result for axis=0 and keeps the names for axis=1.
    return combined.reset_index(drop = True)

In [9]:
# The two frames share only the "One" column; with the default outer join every
# column is kept and cells a frame has no data for are left empty (NaN).
df1 = create_dataframe(["One", "Two"], [["X", "Y"], ["A", "B"]])
df2 = create_dataframe(["One", "Three"], [["E", "F"], ["J", "K"]])
df_concatenate(df1, df2)

Unnamed: 0,One,Two,Three
0,X,A,
1,Y,B,
2,E,,J
3,F,,K


4. Write code to load data from cars.csv into a dataframe and print its details.
Details like: 'count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'.

In [10]:
# Relative path: cars.csv is resolved against the kernel's current working directory.
cars = pd.read_csv('cars.csv')

In [13]:
cars

Unnamed: 0,S.No,model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,1,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,2,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,3,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,4,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,5,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2
5,6,Valiant,18.1,6,225.0,105,2.76,3.46,,1,0,3,1
6,7,Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
7,8,Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
8,9,Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
9,10,Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4


In [11]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   S.No    32 non-null     int64  
 1   model   32 non-null     object 
 2   mpg     32 non-null     float64
 3   cyl     32 non-null     int64  
 4   disp    32 non-null     float64
 5   hp      32 non-null     int64  
 6   drat    32 non-null     float64
 7   wt      32 non-null     float64
 8   qsec    29 non-null     float64
 9   vs      32 non-null     int64  
 10  am      32 non-null     int64  
 11  gear    32 non-null     int64  
 12  carb    32 non-null     int64  
dtypes: float64(5), int64(7), object(1)
memory usage: 3.4+ KB


In [15]:
cars.describe()

Unnamed: 0,S.No,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
count,32.0,32.0,32.0,32.0,32.0,32.0,32.0,29.0,32.0,32.0,32.0,32.0
mean,16.5,20.090625,6.1875,230.721875,146.6875,3.596563,3.21725,17.674828,0.4375,0.40625,3.6875,2.8125
std,9.380832,6.026948,1.785922,123.938694,68.562868,0.534679,0.978457,1.780394,0.504016,0.498991,0.737804,1.6152
min,1.0,10.4,4.0,71.1,52.0,2.76,1.513,14.5,0.0,0.0,3.0,1.0
25%,8.75,15.425,4.0,120.825,96.5,3.08,2.58125,16.87,0.0,0.0,3.0,2.0
50%,16.5,19.2,6.0,196.3,123.0,3.695,3.325,17.42,0.0,0.0,4.0,2.0
75%,24.25,22.8,8.0,326.0,180.0,3.92,3.61,18.6,1.0,1.0,4.0,4.0
max,32.0,33.9,8.0,472.0,335.0,4.93,5.424,22.9,1.0,1.0,5.0,8.0


5. Write a method that takes a column name as an argument and returns the name of the column with which the given column has the highest
correlation. The data to be used is the cars dataset. The returned value should not be the column name that was passed as the
parameter,   
e.g. get_max_correlated_column('mpg') -> should return 'drat'  

In [25]:
def get_max_correlated_column(col):
    '''Name the `cars` column whose correlation with `col` is the largest.

    Reads the module-level `cars` DataFrame. The comparison uses the signed
    correlation coefficient, so a strongly negative correlation is never
    preferred over a weaker positive one. `col` itself is excluded.

    Parameters
    ----------
    col : str
        Name of a numeric column in `cars`.

    Returns
    -------
    The label of the best-correlated other column, or None (after printing a
    message) when `col` is missing from `cars` or is not numeric.
    '''
    if col not in cars.columns.tolist():
        print(f"The given column {col} was not found in cars table")
        return None

    correlations = cars.corr(numeric_only = True)
    if col not in correlations.columns:
        print(f"The given column {col} is not a numeric column in cars table")
        return None

    # Drop the column's correlation with itself before picking the maximum.
    candidates = correlations[col].drop(index = col)
    return candidates.idxmax()
    

In [29]:
get_max_correlated_column('mpg')

'drat'