<a href="https://colab.research.google.com/github/GDharan10/Dataset4_AdmissionsData_sm.Logit/blob/main/Admissions_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Librarys**

In [1]:
import pandas as pd
import numpy as np

#Statistic
from scipy import stats

#Visualization
import plotly.express as px

#Machine Learning
import statsmodels.api as sm

# **DataFrame**

In [2]:
df = pd.read_csv("/content/Admissions Data.csv")

In [3]:
df.head()

Unnamed: 0,Marks,Admitted,Gender
0,1363,No,Male
1,1792,Yes,Female
2,1954,Yes,Female
3,1653,No,Male
4,1593,No,Male


# **DataFrame observation**


In [4]:
#Dependent column   - Admitted
#Independent column - Marks, Gender

#Continuous - Marks
#Category   - Admitted, Gender

#Supervised Classification

# **Data cleaning using pandas**


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168 entries, 0 to 167
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Marks     168 non-null    int64 
 1   Admitted  168 non-null    object
 2   Gender    168 non-null    object
dtypes: int64(1), object(2)
memory usage: 4.1+ KB


In [6]:
df.isnull().sum()

Marks       0
Admitted    0
Gender      0
dtype: int64

# **Hypothesis testing / Statistical analysis**


In [None]:
'''
           Central Limit Theorem
Marks               population mean: 1695.2738095238096 and samples mean: 1696.5268571428574

             1-Tailed Test
Marks               H0 is accepted

                ANOVA
Admitted, Marks     H0 is rejected
Gender, Marks       H0 is rejected

            Chi-Square Test
Admitted, Gender    H0 is rejected
'''

# Correlation

# Central Limit Theorem

In [8]:
population = df.Marks.values
population_mean = population.mean()
samplesize = 35
sample_mean = []
for i in range(25):
  sample = np.random.choice(population,samplesize)
  sample_mean.append(sample.mean())
np.mean(sample_mean)
print(f"population mean: {population_mean} and samples mean:", np.mean(sample_mean)  )

population mean: 1695.2738095238096 and samples mean: 1693.9679999999998


# 1-Tailed Test

In [9]:
H0_accepted = 0
H0_rejected = 0
for i in range(25):
  sample = df.Marks.sample(frac=0.05)
  t_test, p_value = stats.ttest_1samp(sample,df.Marks.mean())
  if p_value > 0.5:
    H0_accepted += 1
  else:
    H0_rejected += 1

print(f"H0 accepted {H0_accepted} times , H0 rejected {H0_rejected} times")
if H0_accepted > H0_rejected:
  print("H0 is accepted, Ha is rejected, There is no significant effect")
else:
  print("H0 is rejected, Ha is accepted, There is a significant effect")

H0 accepted 14 times , H0 rejected 11 times
H0 is accepted, Ha is rejected, There is no significant effect


# **Chi-Square Test**

In [10]:
data = pd.crosstab(df.Admitted, df.Gender)
observed_values = data.values
chi2_stat, p_value, _, _= stats.chi2_contingency(observed_values)
if p_value > 0.05:
  print("H0 is accepted, There is no relationship between two columns we're comparing")
else:
  print("H0 is rejected, There is a relationship between two columns we're comparing")

H0 is rejected, There is a relationship between two columns we're comparing


# ANOVA

In [18]:
group = df.Admitted.unique()
data = {}
for i in group:
  data[i]=df.Marks[df.Admitted == i]
f_value, p_value = stats.f_oneway(*[data[i] for i in group])
if p_value > 0.05:
  print("H0 is accepted, There is a relationship between two columns we're comparing")
else:
  print("H0 is rejected, There is no relationship between two columns we're comparing")

H0 is rejected, There is no relationship between two columns we're comparing


In [19]:
group = df.Gender.unique()
data = {}
for i in group:
  data[i]=df.Marks[df.Gender == i]
f_value, p_value = stats.f_oneway(*[data[i] for i in group])
if p_value > 0.05:
  print("H0 is accepted, There is a relationship between two columns we're comparing")
else:
  print("H0 is rejected, There is no relationship between two columns we're comparing")

H0 is rejected, There is no relationship between two columns we're comparing


# **Data Visualization**



In [13]:
fig = px.sunburst(df,path=['Gender', 'Admitted'],
                    values='Marks')
fig.show()

# **Preprocessing**

In [14]:
df.Gender.unique()

array(['Male', 'Female'], dtype=object)

In [15]:
# Replacing categorical values with numerical equivalents
df["Admitted"]=df["Admitted"].replace({"Yes":1,"No":0})
df['Gender'] = df['Gender'].map({'Female': 201, 'Male': 202})
df

Unnamed: 0,Marks,Admitted,Gender
0,1363,102,202
1,1792,101,201
2,1954,101,201
3,1653,102,202
4,1593,102,202
...,...,...,...
163,1722,101,201
164,1750,101,202
165,1555,102,202
166,1524,102,202


# **Machine Learning**

Supervised Classification - Logistic regression


In [16]:
# 1 data availability
# 2 separating independent and dependent
# 3 identifying algorithms/Model
# 4 training
# 5 evaluation

In [24]:
df.head()

Unnamed: 0,Marks,Admitted,Gender
0,1363,0,202
1,1792,1,201
2,1954,1,201
3,1653,0,202
4,1593,0,202


In [25]:
y = df["Admitted"]
x1 = df[["Marks",	"Gender"]]
x = sm.add_constant(x1)
result = sm.Logit(y, x).fit()
result.pred_table()

Optimization terminated successfully.
         Current function value: 0.120117
         Iterations 10


array([[69.,  5.],
       [ 4., 90.]])

In [26]:
cm_df=pd.DataFrame(result.pred_table())
cm=np.array(cm_df)
acc=(cm[0,0]+cm[1,1])/cm.sum()
print(acc*100)

94.64285714285714


In [27]:
result.summary()

0,1,2,3
Dep. Variable:,Admitted,No. Observations:,168.0
Model:,Logit,Df Residuals:,165.0
Method:,MLE,Df Model:,2.0
Date:,"Sat, 30 Mar 2024",Pseudo R-squ.:,0.8249
Time:,01:41:35,Log-Likelihood:,-20.18
converged:,True,LL-Null:,-115.26
Covariance Type:,nonrobust,LLR p-value:,5.1180000000000006e-42

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,324.5189,168.673,1.924,0.054,-6.075,655.113
Marks,0.0406,0.010,4.129,0.000,0.021,0.060
Gender,-1.9449,0.846,-2.299,0.022,-3.603,-0.287


In [28]:
cm_df=pd.DataFrame(result.pred_table())
cm_df.columns=["predicted 0","predicted 1"]
cm_df=cm_df.rename(index={0:"actual 0",1:"actual 1"})
cm_df

Unnamed: 0,predicted 0,predicted 1
actual 0,69.0,5.0
actual 1,4.0,90.0
