In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, ConfusionMatrixDisplay

# SOAL TIPE DATA

In [None]:
# Data

company_name_list = [{'name': 'Company 1'},
          {'name': 'Company 2'},
          {'name': 'Company 3'}]

employee_name_list = [{'name': 'John Doe'},
          {'name': 'Tom Smith'},
          {'name': 'Andrew Sebastian'}]

company_detail_list = {
      'Company 1': {
          'name': 'Company 1',
          'domain': 'Retail',
          'clients': [
              {
                  'name': 'acme.inc',
                  'country': 'united states'
              },
              {
                  'name': 'Wayne.co',
                  'country': 'united states'
              }
          ]
      },
      'Company 2': {
          'name': 'Company 2',
          'domain': 'Construction',
          'clients': [
              {
                  'name': 'Tesla',
                  'country': 'united states'
              },
              {
                  'name': 'Japan Airlines',
                  'country': 'japan'
              },
              {
                  'name': 'Indofood',
                  'country': 'indonesia'
              }
          ]
      },
      'Company 3': {
          'name': 'Company 3',
          'domain': 'Healthcare',
          'clients': [
              {
                  'name': 'Petronas',
                  'country': 'malaysia'
              },
              {
                  'name': 'VW Group',
                  'country': 'germany'
              },
              {
                  'name': 'IBM',
                  'country': 'united states'
              },
              {
                  'name': 'Mitsubishi',
                  'country': 'japan'
              }
          ]
      }
  }

employee_detail_list = {
      'John Doe': {
          'name': 'EMP-0001',
          'first_name': 'John',
          'last_name': 'Doe',
          'full_name': 'John Doe',
          'company': 'Company 1'
      },
      'Tom Smith': {
          'name': 'EMP-0002',
          'first_name': 'Tom',
          'last_name': 'Smith',
          'full_name': 'Tom Smith',
          'company': 'Company 2'
      },
      'Andrew Sebastian': {
          'name': 'EMP-0003',
          'first_name': 'Andrew',
          'last_name': 'Sebastian',
          'full_name': 'Andrew Sebastian',
          'company': 'Company 2'
      },
  }

### 1

In [4]:
sorted_companies = sorted(
    [{'name': company['name'], 'domain': company_detail_list[company['name']]['domain']}
     for company in company_name_list],
    key=lambda x: x['domain'],
    reverse=True
)

print(sorted_companies)


[{'name': 'Company 1', 'domain': 'Retail'}, {'name': 'Company 3', 'domain': 'Healthcare'}, {'name': 'Company 2', 'domain': 'Construction'}]


### 2

In [5]:
for company in company_name_list:
    company_name = company['name']
    domain = company_detail_list[company_name]['domain']
    num_clients = len(company_detail_list[company_name]['clients'])
    print(f"{company_name}: {domain}, relation: {num_clients} clients")

Company 1: Retail, relation: 2 clients
Company 2: Construction, relation: 3 clients
Company 3: Healthcare, relation: 4 clients


### 3

In [6]:
def get_employee_company_domain():
    return [{'full_name': employee_detail['full_name'],
             'company': employee_detail['company'],
             'domain': company_detail_list[employee_detail['company']]['domain']}
            for employee_name, employee_detail in employee_detail_list.items()]

print(get_employee_company_domain())

[{'full_name': 'John Doe', 'company': 'Company 1', 'domain': 'Retail'}, {'full_name': 'Tom Smith', 'company': 'Company 2', 'domain': 'Construction'}, {'full_name': 'Andrew Sebastian', 'company': 'Company 2', 'domain': 'Construction'}]


#### 4

In [7]:
def get_companies_with_employees():
    return [{'company': company['name'],
             'employees': [employee_detail['full_name'] for employee_detail in employee_detail_list.values() if employee_detail['company'] == company['name']]}
            for company in company_name_list]

print(get_companies_with_employees())

[{'company': 'Company 1', 'employees': ['John Doe']}, {'company': 'Company 2', 'employees': ['Tom Smith', 'Andrew Sebastian']}, {'company': 'Company 3', 'employees': []}]


# SOAL PRE-PROCESSING DATA

In [6]:
data_startup = pd.read_csv('50_Startups.csv')
data_startup.sample(7)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State
15,0.884714,-0.00681,0.353811,0.454805,New York
9,1.0913,-0.554449,0.75405,0.954339,California
28,-0.251744,2.351839,-0.976707,-0.215706,Florida
22,-0.065514,-0.000291,0.738651,-0.037726,Florida
34,-0.711874,1.371439,-0.118436,-0.381091,California
30,-0.346863,-0.280894,-1.226982,-0.299909,Florida
39,-0.896334,-1.564139,-0.450059,-0.776507,California


### 1

In [7]:
# Membaca dataset
df = pd.read_csv("50_Startups.csv")

# Menampilkan field mana saja yang memiliki data kosong
fields_with_missing_data = df.columns[df.isnull().any()]
print("Fields dengan data kosong:", fields_with_missing_data)

# Mengisi data kosong dengan nilai mean
for field in fields_with_missing_data:
    mean_value = df[field].mean()
    df[field].fillna(mean_value, inplace=True)

# Menyimpan hasil perubahan
df.to_csv("50_Startups.csv", index=False)

df.sample(5)

Fields dengan data kosong: Index([], dtype='object')


Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State
23,-1.444366e-16,-0.669502,1.598721e-16,-0.07846521,Florida
15,0.8847141,-0.00681,0.3538112,0.4548054,New York
1,2.01184,1.123258,2.040926,2.012472,California
31,-1.444366e-16,1.175293,-1.253967,7.50435e-16,New York
29,-0.2622012,1.188265,-1.078698,-0.273047,New York


### 2

In [9]:
# Membaca dataset
df = pd.read_csv("50_Startups.csv")

# Melakukan OneHotEncoder ke field State
df = pd.get_dummies(df, columns=['State'])

# Menyimpan hasil perubahan
df.to_csv("nama_file_onehot.csv", index=False)

df.sample(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_California,State_Florida,State_New York
49,-1.444366e-16,-0.228143,-1.652722,-2.446183,True,False,False
6,1.35578,0.959067,-0.888067,1.114512,True,False,False
27,-0.109755,0.199383,1.200578,-0.172257,False,False,True
39,-0.8963339,-1.564139,-0.450059,-0.776507,True,False,False
34,-0.7118741,1.371439,-0.118436,-0.381091,True,False,False


### 3

In [11]:
# Membaca dataset
df = pd.read_csv("50_Startups.csv")

# Menghitung nilai Tax
df['Tax'] = (df['Profit'] + df['Marketing Spend'] + df['Administration']) * 0.05

# Menyimpan hasil perubahan
df.to_csv("nama_file_tax.csv", index=False)

df.sample(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State,Tax
47,-1.444366e-16,0.496524,1.598721e-16,-1.744362,California,-0.062392
23,-1.444366e-16,-0.669502,1.598721e-16,-0.07846521,Florida,-0.037398
13,0.356456,0.499201,0.2694074,7.50435e-16,California,0.03843
6,1.35578,0.959067,-0.8880668,1.114512,California,0.059276
15,0.8847141,-0.00681,0.3538112,0.4548054,New York,0.04009


### 4

In [12]:
from sklearn.preprocessing import StandardScaler

# Membaca dataset
df = pd.read_csv("50_Startups.csv")

# Menghapus kolom non-numerik (State)
df_numerical = df.drop(columns=['State'])

# Inisialisasi StandardScaler
scaler = StandardScaler()

# Melakukan scaling ke field-field numerik
scaled_data = scaler.fit_transform(df_numerical)

# Membuat DataFrame baru dari data yang sudah di-scale
df_scaled = pd.DataFrame(scaled_data, columns=df_numerical.columns)

# Menambahkan kolom State yang telah dihapus sebelumnya
df_scaled['State'] = df['State']

# Menyimpan hasil perubahan
df_scaled.to_csv("50_Startups.csv", index=False)

df.sample(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State
45,-1.776911,0.05355035,-2.053552,-1.181303,New York
32,-0.313702,0.252626,-1.644272,-0.363091,California
7,1.254557,6.655787e-16,0.929087,1.1052,Florida
39,-0.896334,-1.564139,-0.450059,-0.776507,California
43,-1.43682,0.1804345,-1.742014,-1.059638,New York
