### Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from dateutil import parser

### Importing the data

In [2]:
raw_train = pd.read_csv('train.csv')
raw_test = pd.read_csv('test_x.csv')

train_df = raw_train.copy()
test_df = raw_test.copy()

  raw_train = pd.read_csv('train.csv')


### Functions

In [3]:
def analyze_dataframe(df):
    """
    Function to analyze a DataFrame by printing:
    - Shape of the DataFrame
    - Information about each column (non-null count and data type)
    - Descriptive statistics for numerical columns
    - Number of missing values in each column
    - List of all column names

    Parameters:
    df (pd.DataFrame): The DataFrame to analyze
    """
    # Print shape of the DataFrame
    print("SHAPE")
    print(df.shape)
    print("\n----------------------------------------------------\n")
    
    # Print column information (non-null count, data types)
    print(df.info())
    print("\n----------------------------------------------------\n")
    
    # Print descriptive statistics for numerical columns
    print(df.describe())
    print("\n----------------------------------------------------\n")
    
    # Print count of missing values in each column
    print(df.isna().sum())
    print("\n----------------------------------------------------\n")
    
    # Print all column names
    print(df.columns)

In [4]:
def analyze_column(df, column_name):
    """
    Function to analyze a column in a DataFrame by printing:
    - Unique values
    - Value counts (including NaNs)
    - Number of missing values (NaNs)

    Parameters:
    df (pd.DataFrame): The DataFrame containing the column
    column_name (str): The name of the column to analyze
    """
    # Print unique values
    print(f"Unique values in '{column_name}':")
    print(df[column_name].unique())
    print(' ')
    
    # Print value counts (including NaNs)
    print(f"Value counts in '{column_name}' (including NaNs):")
    print(df[column_name].value_counts(dropna=False))
    print(' ')
    
    # Print the number of missing values (NaNs)
    print(f"Number of missing values (NaNs) in '{column_name}':")
    print(df[column_name].isna().sum())
    
    print(' ')
    print(df[column_name].describe())

In [5]:
def standardize_dates(df, date_column):
    """
    Standardizes date formats in the specified column of the DataFrame.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the date column.
    date_column (str): The name of the column containing dates in various formats.

    Returns:
    pd.DataFrame: DataFrame with the standardized date column.
    """
    def parse_date(date_str):
        try:
            return parser.parse(date_str, dayfirst=True)
        except (ValueError, TypeError):
            return pd.NaT

    # Apply the parsing function
    df['Parsed Date'] = df[date_column].apply(parse_date)

    # Ensure 'Parsed Date' column is of datetime type
    df['Parsed Date'] = pd.to_datetime(df['Parsed Date'], errors='coerce')

    # Convert to standard format
    df['Standardized Date'] = df['Parsed Date'].dt.strftime('%Y-%m-%d')

    # Replace original column and clean up
    df[date_column] = df['Standardized Date']
    df = df.drop(columns=['Parsed Date', 'Standardized Date'])
    
    return df

In [6]:
def plot_boxplot_with_outliers(df, column_name):
    """
    Plots a boxplot of the specified column, highlighting the outliers.
    
    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    column_name (str): The name of the column to plot.
    """
    # Create a boxplot
    plt.figure(figsize=(10, 12))
    sns.boxplot(data=df, y=column_name)
    
    # Add title and labels
    plt.title(f'Boxplot of {column_name}')
    plt.ylabel(column_name)
    
    # Show the plot
    plt.show()

In [7]:
def list_outlier_values(df, column_name):
    """
    Lists outlier values in a specified column of a DataFrame using the IQR method.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    column_name (str): The name of the column to check for outliers.

    Returns:
    list: A list of outlier values in the specified column.
    """
    # Calculate Q1 (25th percentile) and Q3 (75th percentile)
    Q1 = df[column_name].quantile(0.25)
    Q3 = df[column_name].quantile(0.75)
    
    # Calculate the Interquartile Range (IQR)
    IQR = Q3 - Q1
    
    # Determine the outlier boundaries
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Detect outlier values
    outlier_values = df[(df[column_name] < lower_bound) | (df[column_name] > upper_bound)][column_name].tolist()
    
    return outlier_values

### 1-) Exploratory Data Analysis

In [8]:
train_df.head()

Unnamed: 0,Basvuru Yili,Degerlendirme Puani,Cinsiyet,Dogum Tarihi,Dogum Yeri,Ikametgah Sehri,Universite Adi,Universite Turu,Burslu ise Burs Yuzdesi,Burs Aliyor mu?,...,Spor Dalindaki Rolunuz Nedir?,Aktif olarak bir STK üyesi misiniz?,Hangi STK'nin Uyesisiniz?,Stk Projesine Katildiniz Mi?,Girisimcilikle Ilgili Deneyiminiz Var Mi?,Girisimcilikle Ilgili Deneyiminizi Aciklayabilir misiniz?,Ingilizce Biliyor musunuz?,Ingilizce Seviyeniz?,"Daha Önceden Mezun Olunduysa, Mezun Olunan Üniversite",id
0,2014,52.0,Erkek,4/6/1994,"Altindag, Ankara",Ankara,İHSAN DOĞRAMACI BİLKENT,Özel,100.0,Evet,...,Diğer,Hayır,,Evet,Hayır,,,,,0
1,2014,30.0,Erkek,6/11/1993,Üsküdar,İstanbul,İHSAN DOĞRAMACI BİLKENT,Özel,100.0,Hayır,...,0,Hayır,,Evet,Evet,Bilkent Cyberparkta bir şirkette Türkiye nin i...,,,,1
2,2014,18.0,Erkek,1/15/1986,Samsun,İstanbul,ULUSLARARASI KIBRIS ÜNİVERSİTESİ,Özel,100.0,Hayır,...,0,Hayır,,Hayır,Hayır,,,,,2
3,2014,40.0,Erkek,6/4/1991,Diyarbakır,İstanbul,İSTANBUL ŞEHİR ÜNİVERSİTESİ,Özel,100.0,Evet,...,0,Hayır,,Evet,Hayır,,,,,3
4,2014,24.0,Erkek,2 Kasim 1992,Ankara/Altındağ,Ankara,TURGUT ÖZAL ÜNİVERSİTESİ,Özel,100.0,Evet,...,0,Hayır,,Hayır,Hayır,,,,,4


In [9]:
analyze_dataframe(train_df)

SHAPE
(65125, 44)

----------------------------------------------------

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65125 entries, 0 to 65124
Data columns (total 44 columns):
 #   Column                                                     Non-Null Count  Dtype  
---  ------                                                     --------------  -----  
 0   Basvuru Yili                                               65125 non-null  int64  
 1   Degerlendirme Puani                                        65124 non-null  float64
 2   Cinsiyet                                                   64956 non-null  object 
 3   Dogum Tarihi                                               64948 non-null  object 
 4   Dogum Yeri                                                 64334 non-null  object 
 5   Ikametgah Sehri                                            63088 non-null  object 
 6   Universite Adi                                             64993 non-null  object 
 7   Universite Turu      

## 2-) Preproccessing 

### 2.1-) Degerlendirme Puani

In [10]:
analyze_column(train_df, 'Degerlendirme Puani')

Unique values in 'Degerlendirme Puani':
[ 52.  30.  18.  40.  24.  37.  31.  28.  26.  58.  34.  33.  48.  10.
   8.  54.  14.  44.  60.  42.  22.  16.  29.  46.  41.  50.  53.  36.
  32.  38.  20.  12.  51.  65.  56.  61.  62.  49.  70.  43.  23.  64.
  59.  57.  63.  71.  98.  21.  66.  27.  39.  68.  85.  45.  67.  47.
  78.  19.  13.  90.  72.  77.  55.   6.  79.  76.  75.  74.  73.  17.
  80.  84.  86.  82.  88.  92.  94.  96. 102.   7.   4.  15.  35.  25.
   5.  11.   9.   3.   2.   1.   0.  69.  81.  89.  83. 100.  95.  nan]
 
Value counts in 'Degerlendirme Puani' (including NaNs):
Degerlendirme Puani
28.0     2063
22.0     2041
18.0     1995
24.0     1785
12.0     1740
         ... 
83.0        2
100.0       2
89.0        1
95.0        1
NaN         1
Name: count, Length: 98, dtype: int64
 
Number of missing values (NaNs) in 'Degerlendirme Puani':
1
 
count    65124.000000
mean        32.086466
std         18.139239
min          0.000000
25%         18.000000
50%         29.000

In [11]:
train_df['Degerlendirme Puani'] = train_df['Degerlendirme Puani'].fillna(train_df['Degerlendirme Puani'].median())

In [12]:
analyze_column(train_df, 'Degerlendirme Puani')

Unique values in 'Degerlendirme Puani':
[ 52.  30.  18.  40.  24.  37.  31.  28.  26.  58.  34.  33.  48.  10.
   8.  54.  14.  44.  60.  42.  22.  16.  29.  46.  41.  50.  53.  36.
  32.  38.  20.  12.  51.  65.  56.  61.  62.  49.  70.  43.  23.  64.
  59.  57.  63.  71.  98.  21.  66.  27.  39.  68.  85.  45.  67.  47.
  78.  19.  13.  90.  72.  77.  55.   6.  79.  76.  75.  74.  73.  17.
  80.  84.  86.  82.  88.  92.  94.  96. 102.   7.   4.  15.  35.  25.
   5.  11.   9.   3.   2.   1.   0.  69.  81.  89.  83. 100.  95.]
 
Value counts in 'Degerlendirme Puani' (including NaNs):
Degerlendirme Puani
28.0     2063
22.0     2041
18.0     1995
24.0     1785
12.0     1740
         ... 
85.0        2
83.0        2
100.0       2
89.0        1
95.0        1
Name: count, Length: 97, dtype: int64
 
Number of missing values (NaNs) in 'Degerlendirme Puani':
0
 
count    65125.000000
mean        32.086418
std         18.139104
min          0.000000
25%         18.000000
50%         29.000000
7

### 2.2-) Cinsiyet Column

In [13]:
analyze_column(train_df, 'Cinsiyet')

Unique values in 'Cinsiyet':
['Erkek' 'Kadın' 'ERKEK' 'Belirtmek istemiyorum' nan]
 
Value counts in 'Cinsiyet' (including NaNs):
Cinsiyet
Kadın                    32077
Erkek                    22752
ERKEK                    10012
NaN                        169
Belirtmek istemiyorum      115
Name: count, dtype: int64
 
Number of missing values (NaNs) in 'Cinsiyet':
169
 
count     64956
unique        4
top       Kadın
freq      32077
Name: Cinsiyet, dtype: object


In [14]:
train_df['Cinsiyet'] = train_df['Cinsiyet'].replace('ERKEK', 'Erkek')
train_df['Cinsiyet'].isin(['ERKEK']).any()

False

In [15]:
# Replacing the NaN values with 'Erkek'(Mod)

train_df['Cinsiyet'] = train_df['Cinsiyet'].fillna('Erkek')
train_df['Cinsiyet'].value_counts(dropna = False)

Cinsiyet
Erkek                    32933
Kadın                    32077
Belirtmek istemiyorum      115
Name: count, dtype: int64

In [16]:
analyze_column(train_df, 'Cinsiyet')

Unique values in 'Cinsiyet':
['Erkek' 'Kadın' 'Belirtmek istemiyorum']
 
Value counts in 'Cinsiyet' (including NaNs):
Cinsiyet
Erkek                    32933
Kadın                    32077
Belirtmek istemiyorum      115
Name: count, dtype: int64
 
Number of missing values (NaNs) in 'Cinsiyet':
0
 
count     65125
unique        3
top       Erkek
freq      32933
Name: Cinsiyet, dtype: object


In [17]:
label_encoder = LabelEncoder()
train_df['Cinsiyet'] = label_encoder.fit_transform(train_df['Cinsiyet'])

In [18]:
analyze_column(train_df, 'Cinsiyet')

Unique values in 'Cinsiyet':
[1 2 0]
 
Value counts in 'Cinsiyet' (including NaNs):
Cinsiyet
1    32933
2    32077
0      115
Name: count, dtype: int64
 
Number of missing values (NaNs) in 'Cinsiyet':
0
 
count    65125.000000
mean         1.490779
std          0.503439
min          0.000000
25%          1.000000
50%          1.000000
75%          2.000000
max          2.000000
Name: Cinsiyet, dtype: float64


### 2.3-) Dogum Tarihi

In [19]:
standardize_dates(train_df, 'Dogum Tarihi')

Unnamed: 0,Basvuru Yili,Degerlendirme Puani,Cinsiyet,Dogum Tarihi,Dogum Yeri,Ikametgah Sehri,Universite Adi,Universite Turu,Burslu ise Burs Yuzdesi,Burs Aliyor mu?,...,Spor Dalindaki Rolunuz Nedir?,Aktif olarak bir STK üyesi misiniz?,Hangi STK'nin Uyesisiniz?,Stk Projesine Katildiniz Mi?,Girisimcilikle Ilgili Deneyiminiz Var Mi?,Girisimcilikle Ilgili Deneyiminizi Aciklayabilir misiniz?,Ingilizce Biliyor musunuz?,Ingilizce Seviyeniz?,"Daha Önceden Mezun Olunduysa, Mezun Olunan Üniversite",id
0,2014,52.0,1,1994-06-04,"Altindag, Ankara",Ankara,İHSAN DOĞRAMACI BİLKENT,Özel,100.0,Evet,...,Diğer,Hayır,,Evet,Hayır,,,,,0
1,2014,30.0,1,1993-11-06,Üsküdar,İstanbul,İHSAN DOĞRAMACI BİLKENT,Özel,100.0,Hayır,...,0,Hayır,,Evet,Evet,Bilkent Cyberparkta bir şirkette Türkiye nin i...,,,,1
2,2014,18.0,1,1986-01-15,Samsun,İstanbul,ULUSLARARASI KIBRIS ÜNİVERSİTESİ,Özel,100.0,Hayır,...,0,Hayır,,Hayır,Hayır,,,,,2
3,2014,40.0,1,1991-04-06,Diyarbakır,İstanbul,İSTANBUL ŞEHİR ÜNİVERSİTESİ,Özel,100.0,Evet,...,0,Hayır,,Evet,Hayır,,,,,3
4,2014,24.0,1,,Ankara/Altındağ,Ankara,TURGUT ÖZAL ÜNİVERSİTESİ,Özel,100.0,Evet,...,0,Hayır,,Hayır,Hayır,,,,,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65120,2022,0.0,1,2003-06-18,İstanbul,İstanbul,İstanbul Aydın Üniversitesi,Özel,,Evet,...,-,Hayır,-,,Hayır,-,Hayır,,,65120
65121,2022,0.0,1,2002-04-18,Hatay,Muğla,Muğla Sıtkı Koçman Üniversitesi,Devlet,,Hayır,...,-,Hayır,-,,Hayır,-,Hayır,,,65121
65122,2022,0.0,1,1999-10-23,Mersin,Mersin,Süleyman Demirel Üniversitesi,Devlet,,Hayır,...,-,Hayır,-,,Hayır,-,Hayır,,,65122
65123,2022,0.0,1,2000-11-17,Samsun,Samsun,Karabük Üniversitesi,Devlet,,Hayır,...,-,Hayır,-,,Hayır,-,Hayır,,,65123


In [20]:
analyze_dataframe(train_df)

SHAPE
(65125, 46)

----------------------------------------------------

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65125 entries, 0 to 65124
Data columns (total 46 columns):
 #   Column                                                     Non-Null Count  Dtype         
---  ------                                                     --------------  -----         
 0   Basvuru Yili                                               65125 non-null  int64         
 1   Degerlendirme Puani                                        65125 non-null  float64       
 2   Cinsiyet                                                   65125 non-null  int64         
 3   Dogum Tarihi                                               62009 non-null  object        
 4   Dogum Yeri                                                 64334 non-null  object        
 5   Ikametgah Sehri                                            63088 non-null  object        
 6   Universite Adi                                       

In [21]:
train_df['Dogum Tarihi'] = train_df['Parsed Date']

In [22]:
train_df.drop(columns=['Parsed Date', 'Standardized Date'], inplace=True)

In [23]:
reference_date = pd.Timestamp('2023-01-01')

# Calculate the age by dividing the timedelta by one year (in days)
train_df['Dogum Tarihi'] = (reference_date - train_df['Dogum Tarihi']).dt.days / 365.25

print(train_df[['Dogum Tarihi']])

       Dogum Tarihi
0         28.577687
1         29.152635
2         36.960986
3         31.739904
4               NaN
...             ...
65120     19.540041
65121     20.706366
65122     23.192334
65123     22.121834
65124     22.680356

[65125 rows x 1 columns]


In [24]:
analyze_column(train_df, 'Dogum Tarihi')

Unique values in 'Dogum Tarihi':
[28.57768652 29.15263518 36.96098563 ... 19.15126626 19.72895277
 19.137577  ]
 
Value counts in 'Dogum Tarihi' (including NaNs):
Dogum Tarihi
 NaN          3116
-47.003422    1655
 23.000684     290
 21.998631     247
 24.000000     230
              ... 
 31.635866       1
 30.012320       1
 33.021218       1
 6.989733        1
 19.137577       1
Name: count, Length: 5050, dtype: int64
 
Number of missing values (NaNs) in 'Dogum Tarihi':
3116
 
count    62009.000000
mean        23.365211
std         12.127657
min        -47.173169
25%         22.639288
50%         24.928131
75%         27.501711
max        134.354552
Name: Dogum Tarihi, dtype: float64


In [25]:
outlier_values = list_outlier_values(train_df, 'Dogum Tarihi')
print(outlier_values)

[36.96098562628337, 35.15947980835045, 52.28199863107461, 52.99931553730322, 36.95003422313484, 36.96646132785763, 36.5201916495551, 52.99931553730322, 52.99931553730322, 6.551676933607118, 6.401095140314853, 6.182067077344285, 6.324435318275154, 5.689253935660506, 1.0458590006844628, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.00342231348392, -47.0034223134

In [28]:
train_df.loc[train_df['Dogum Tarihi'] < 14, 'Dogum Tarihi'] = 0
train_df.loc[train_df['Dogum Tarihi'] > 34, 'Dogum Tarihi'] = 0

In [29]:
outlier_values = list_outlier_values(train_df, 'Dogum Tarihi')
print(outlier_values)

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

In [30]:
#Calculate the mean value of the column, ignoring NaNs
mean_value = train_df['Dogum Tarihi'].replace(0, np.nan).mean()

#Replace NaNs and 0s with the mean value
train_df['Dogum Tarihi'] = train_df['Dogum Tarihi'].replace(0, np.nan).fillna(mean_value)

In [33]:
analyze_column(train_df, 'Dogum Tarihi')

Unique values in 'Dogum Tarihi':
[28.57768652 29.15263518 25.26359953 ... 19.15126626 19.72895277
 19.137577  ]
 
Value counts in 'Dogum Tarihi' (including NaNs):
Dogum Tarihi
25.263600    4996
23.000684     290
21.998631     247
24.000000     230
24.999316     228
             ... 
32.755647       1
31.578371       1
31.247091       1
33.505818       1
19.137577       1
Name: count, Length: 4879, dtype: int64
 
Number of missing values (NaNs) in 'Dogum Tarihi':
0
 
count    65125.000000
mean        25.263600
std          2.901657
min         16.114990
25%         22.978782
50%         25.263600
75%         27.356605
max         33.998631
Name: Dogum Tarihi, dtype: float64


In [None]:
anal