<a href="https://colab.research.google.com/github/Source-Code777/Machine_Learning_Projects/blob/main/Counselling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **GATHERING DATA FROM THE WBJEE WEBSITE USING THE PANDAS WEB SCRAPER (`read_html`)**

In [None]:
import pandas as pd
# Report page whose first HTML table is later used as the 2023 data set.
url_23='https://admissions.nic.in/wbjeeb/Applicant/report/orcrreport.aspx?enc=b6w3EPyuw0C4FADZ4v1XmYUz0XFq314fzLjkE3wbM2xr/DbsjpvUS9LBCKXjSeSL'
# pd.read_html downloads the page and returns a list of DataFrames,
# one per <table> element it can parse (requires network access).
tables_23=pd.read_html(url_23)
# Report page whose first HTML table is later used as the 2024 data set.
url_24='https://admissions.nic.in/wbjeeb/Applicant/report/orcrreport.aspx?enc=Nm7QwHILXclJQSv2YVS+7l8OpFY/O746kfneOXEneV50mv1B/txHsSKB11hFlsvw'
tables_24=pd.read_html(url_24)

**CREATING DATAFRAMES YEAR-WISE**

In [None]:
df_23=tables_23[0]
df_24=tables_24[0]

In [None]:
df_23.sample(5)

**RENAMING THE FEATURES**

In [None]:
def clean_table(df, year):
    """Return a cleaned copy of one year's scraped table.

    The input frame is left untouched, so the cell that calls this function
    can be re-run on the same raw table and always gives the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Table as produced by ``pd.read_html``.
    year : int
        Value stored in the added ``Year`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with whitespace-stripped column labels, alternative header
        spellings mapped to one canonical name, and a ``Year`` column.
    """
    # Only the spellings that actually change are listed; every other
    # column keeps its (stripped) label.
    rename_map = {
        "Institute": "College Name",
        "Institute Name": "College Name",
        "Branch": "Program",
        "Seat Pool": "Quota",
    }

    # rename() returns a new frame, so the caller's column labels are never
    # modified. Non-string labels are passed through unchanged.
    cleaned = df.rename(
        columns=lambda col: col.strip() if isinstance(col, str) else col
    )
    cleaned = cleaned.rename(columns=rename_map)
    cleaned["Year"] = year
    return cleaned

**ADDING A YEAR COLUMN TO EACH DATAFRAME**

In [None]:
df_23 = clean_table(df_23, 2023)
df_24 = clean_table(df_24, 2024)

**CONCATENATING THE DATAFRAMES**

In [None]:
# Stack the two yearly tables into one frame with a fresh 0..n-1 index.
yearly_frames = [df_23, df_24]
df = pd.concat(yearly_frames, ignore_index=True)

# Force both rank columns to numeric; values that cannot be parsed become NaN.
for rank_col in ("Opening Rank", "Closing Rank"):
    df[rank_col] = pd.to_numeric(df[rank_col], errors="coerce")

print(df.shape)
print(df.head())

In [None]:
df.sample(5)

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
# Remove the serial-number column. errors='ignore' keeps this cell
# re-runnable after the column has already been dropped.
df = df.drop(columns=['Sr.No'], errors='ignore')

In [None]:
df.sample(5)

In [None]:
df['Category'].unique()

In [None]:
df['Stream'].unique()

In [None]:
df['Quota'].unique()

In [None]:
!pip install ydata-profiling

# **DATA PROFILING USING YDATA-PROFILING (FORMERLY PANDAS PROFILING)**

In [None]:
import pandas as pd
from ydata_profiling import ProfileReport

In [None]:
# Build the profiling report for the combined DataFrame.
profile = ProfileReport(df, title="My DataFrame Profile")
# Save the report as report.html (relative to the current working directory).
profile.to_file("report.html")
from IPython.display import HTML
# Last expression of the cell: render the saved HTML file inline.
HTML(filename='report.html')

**Problems detected in the data (alerts from the profiling report):**

1. `Category` is highly correlated overall with `Quota` (high correlation).

2. `Closing Rank` is highly correlated overall with `Opening Rank` and 1 other field (high correlation).

3. `Opening Rank` is highly correlated overall with `Closing Rank` and 1 other field (high correlation).

4. `Quota` is highly correlated overall with `Category` and 1 other field (high correlation).

5. `Seat Type` is highly correlated overall with `Closing Rank` and 3 other fields (high correlation).

6. `Stream` is highly correlated overall with `Seat Type` and 1 other field (high correlation).

7. `Year` is highly correlated overall with `Stream` (high correlation).

In [None]:
df.sample(5)

# **DATA PREPROCESSING**

In [None]:
# Strip the literal prefix 'Round ' and store the remainder as an integer.
# Casting to str first keeps the cell re-runnable: on a second execution the
# column is already numeric and has no .str accessor.
df['Round'] = (
    df['Round']
    .astype(str)
    .str.replace('Round ', '', regex=False)
    .str.strip()
    .astype(int)
)
df.head()

In [None]:
df['Program'].unique()

In [None]:
df['Program'].duplicated()

In [None]:
# na=False counts a missing program name as "no match"; without it a NaN in
# the mask makes the boolean selection fail. regex=False matches the literal
# text 'TFW'.
is_tfw = df['Program'].str.contains('TFW', na=False, regex=False)
tfw_programs = df.loc[is_tfw, 'Program'].unique()
display(tfw_programs)

In [None]:
df['Program'] = df['Program'].str.replace('TFW', '').str.strip()
display(df.head())

In [None]:
df['Program'].unique()

# **CLEANING THE PROGRAM COLUMN**

In [None]:
import re
# Normalise program names with vectorised string methods (these pass NaN
# through instead of raising, unlike re.sub inside apply):
#   1. drop commas and full stops (literal matches, not regex);
#   2. remove each parenthesised group separately, allowing one level of
#      nesting; a greedy r'\(.*\)' would also delete the text between two groups;
#   3. collapse the whitespace runs left behind and trim the ends.
df['Program'] = (
    df['Program']
    .str.replace(',', '', regex=False)
    .str.replace('.', '', regex=False)
    .str.replace(r'\((?:[^()]|\([^()]*\))*\)', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)
display(df['Program'].unique())

In [None]:
display(df['Program'].unique())

In [None]:
# Rows whose program name contains the literal text 'COMPUTER' (case-sensitive).
# na=False treats a missing program name as a non-match so the mask stays boolean.
computer_programs = df[df['Program'].str.contains('COMPUTER', na=False, regex=False)]
display(computer_programs)

In [None]:
display(df['Program'].unique())

# **CREATING A MAPPING FUNCTION AND APPLYING IT ON PROGRAM COLUMN**

In [None]:
# Substring -> short branch code.
# map_branch walks this dict in insertion order and returns the code of the
# first key found in the program name, so the ORDER of entries is significant:
# a name containing both 'COMPUTER' and 'DATA SCIENCE' resolves to 'CSE', and
# one containing both 'ELECTRICAL' and 'ELECTRONICS' resolves to 'EE'.
# Keys are upper-case because map_branch upper-cases the name before matching.
keywords = {
    'COMPUTER': 'CSE',
    'INFORMATION TECHNOLOGY': 'IT',
    'DATA SCIENCE': 'DS',
    'ARTIFICIAL INTELLIGENCE': 'AI',
    'CIVIL': 'CE',
    'MECHANICAL': 'ME',
    'ELECTRICAL': 'EE',
    'ELECTRONICS': 'ECE',
    'BIOMEDICAL': 'BME',
    'BIOTECH': 'BT',
    'CHEMICAL': 'CHE',
    'CERAMIC': 'CER',
    'METALLURGICAL': 'MME',
    'TEXTILE': 'TT',
    'FOOD': 'FT',
    'DAIRY': 'DT',
    'AGRICULTURAL': 'AG',
    'MARINE': 'MRE',
    'MINING': 'MN',
    'PHARM': 'PHARM',
    'ARCHITECTURAL': 'ARCH',
    'AUTOMOBILE': 'AUTO',
    'ROBOT': 'ROB',
    'PLANNING': 'BP',
    'PRODUCTION': 'PR'
}

In [None]:
def map_branch(x, mapping=None):
    """Map a free-text program name to a short branch code.

    The name is upper-cased, the separators '/', '-' and '&' are normalised,
    and runs of whitespace are collapsed to a single space so that multi-word
    keys such as 'INFORMATION TECHNOLOGY' still match names written like
    'Information - Technology'.

    Parameters
    ----------
    x : object
        Program name; converted with ``str`` first, so NaN becomes 'NAN'.
    mapping : dict, optional
        Substring -> code table, checked in insertion order. Defaults to the
        notebook-level ``keywords`` dict.

    Returns
    -------
    str
        Code of the first key contained in the normalised name, or 'OTHER'
        when no key matches.
    """
    if mapping is None:
        mapping = keywords

    name = str(x).upper()
    for separator in ("/", "-"):
        name = name.replace(separator, " ")
    # Pad 'AND' so that 'A&B' becomes 'A AND B' rather than 'AANDB'.
    name = name.replace("&", " AND ")
    # split()/join trims the ends and collapses every whitespace run.
    name = " ".join(name.split())

    for keyword, code in mapping.items():
        if keyword in name:
            return code
    return 'OTHER'

In [None]:
df['Program_Ref']=df['Program'].apply(map_branch)

In [None]:
# Drop both columns in one call. `columns=` already names the axis, so axis=1
# is not needed; errors="ignore" keeps the cell re-runnable after the columns
# have been removed.
df = df.drop(columns=["Stream", "Program"], errors="ignore")

In [None]:
df.sample(5)

In [None]:
df.isna().sum()

# **VISUALIZING THE DATA**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Spread of closing ranks for each year; plain (non-scientific) y tick labels.
ax = sns.boxplot(data=df, x="Year", y="Closing Rank")
ax.ticklabel_format(style='plain', axis='y')
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))
# Ten colleges with the smallest median closing rank.
top_colleges = (
    df.groupby("College Name")["Closing Rank"]
    .median()
    .sort_values()
    .head(10)
)
sns.barplot(x=top_colleges.values, y=top_colleges.index, ax=ax)
ax.ticklabel_format(style='plain', axis='x')
ax.set_title("Top 10 Colleges by Median Closing Rank")
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(8, 6))
# One point per row, coloured by year; alpha softens overplotting.
sns.scatterplot(data=df, x="Opening Rank", y="Closing Rank", hue="Year", alpha=0.6, ax=ax)
ax.ticklabel_format(style='plain', axis='both')
ax.set_title("Opening vs Closing Rank")
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))
# NOTE(review): recent seaborn releases rename `scale` to `density_norm` -
# confirm against the installed version before changing it.
sns.violinplot(
    data=df,
    x="Category",
    y="Closing Rank",
    scale="width",
    inner="quartile",
    ax=ax,
)
ax.ticklabel_format(style='plain', axis='y')
ax.set_title("Closing Rank Distribution by Category")
plt.xticks(rotation=45)
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))
# Mean closing rank per branch code, smallest first.
branch_rank = (
    df.groupby("Program_Ref")["Closing Rank"]
    .mean()
    .sort_values()
)
sns.barplot(x=branch_rank.values, y=branch_rank.index, ax=ax)
ax.ticklabel_format(style='plain', axis='x')
ax.set_title("Average Closing Rank by Branch")
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))
# Number of rows per branch code, grouped by year.
sns.countplot(data=df, x="Year", hue="Program_Ref", palette="Set2", ax=ax)
ax.set_title("Branch Popularity Over Years (Seat Allotments)")
# Anchor the legend just outside the right edge of the axes.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))
# Closing-rank spread per branch code, split by year.
sns.boxplot(data=df, x="Program_Ref", y="Closing Rank", hue="Year", ax=ax)
ax.ticklabel_format(style='plain', axis='y')
ax.set_title("Closing Rank by Branch and Year")
plt.xticks(rotation=45)
plt.show()

# **LET'S DETECT OUTLIERS**

In [None]:
import numpy as np

def detect_outliers(df, column):
    """Find rows whose value in `column` lies outside the 1.5 * IQR fences.

    Returns a tuple ``(outlier_rows, lower_fence, upper_fence)``. Rows with a
    missing value compare False on both sides and are therefore never flagged.
    """
    values = df[column]
    first_q, third_q = (values.quantile(q) for q in (0.25, 0.75))
    spread = third_q - first_q

    lower = first_q - 1.5 * spread
    upper = third_q + 1.5 * spread

    is_outside = values.lt(lower) | values.gt(upper)
    return df.loc[is_outside], lower, upper

# Apply the IQR rule to each rank column separately.
outliers_open, low_open, up_open = detect_outliers(df, column="Opening Rank")
outliers_close, low_close, up_close = detect_outliers(df, column="Closing Rank")

print(f"Opening Rank outliers: {len(outliers_open)}")
print(f"Closing Rank outliers: {len(outliers_close)}")

In [None]:
df.sample(5)