#### Importing libraries

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from warnings import filterwarnings
filterwarnings('ignore')
import missingno as msno
import dateparser

from scipy import stats
from scipy.stats import norm, skew

#### Reading data

In [None]:
# Load the train/test listings and a spreadsheet of coordinates; the paths are
# relative to the notebook's working directory.
# NOTE(review): `attractions` is only referenced by the commented-out distance
# cells further down — presumably landmark coordinates; confirm it is still needed.
train = pd.read_csv("binaaz_train.csv")
test = pd.read_csv("binaaz_test.csv")
attractions = pd.read_excel('baku_coordinates.xlsx')
train.head()

In [None]:
# (rows, columns) of both splits.
print(train.shape)
print(test.shape)

In [None]:
# Column labels of the training frame.
train.columns

In [None]:
# popular =  list(zip(attractions.Latitude, attractions.Longitude))
# popular

In [None]:
# points = pd.DataFrame()
# points['points'] = list(zip(train.latitude, train.longitude))

In [None]:
# test_points = pd.DataFrame()
# test_points['points'] = list(zip(test.latitude, test.longitude))

In [None]:
# from geopy.distance import great_circle
# from tqdm import tqdm 

# for key,value in tqdm(zip(attractions['Title'],popular)):
#     train[key] = points['points'].apply(lambda x: great_circle(value, x).km)
    
# for key,value in tqdm(zip(attractions['Title'],popular)):
#     test[key] = test_points['points'].apply(lambda x: great_circle(value, x).km)    


In [None]:
# Per-column summary of the test split (verbose=True forces the full column listing).
test.info(verbose = True)

### Feature Extraction

In [None]:
# Keyword spellings searched for in the lower-cased listing description.
# Every entry becomes one boolean column named after its key.  All spellings
# must themselves be lower-case, because they are matched against lower-cased
# text.
_DESCRIPTION_KEYWORDS = {
    'kristalAbseron': ['kristal abseron', 'kristal abşeron', 'kistal abşeron'],
    'nearHospital': ['xestexana', 'klinika', 'klinka', 'xəstəxana', 'balnisiya', 'dogum evi', 'doğum evi'],
    'university': ['universitet', 'unverstit', 'unversitet'],
    'urg': ['tecili', 'təcili', 'tacili', 'tcili', 'təcılı', 'elimyandi', 'əlimyandı'],
    'deniz': ['deniz', 'dəniz'],
    'repaired': ['temirli', 'təmirli', 'temirri'],
    'unrepaired': ['temirsiz', 'təmirsiz', 'temırız'],
    'designed': ['dizayner', 'dizayn', 'dizayin', 'dızayın', 'dizayın'],
    'nearMetro': ['metro'],
    'nearGarden': ['bagca', 'baxça', 'bağça', 'baxca'],
    'podMayak': ['pod mayak', 'padmayak', 'podmoyak', 'padmoyak', 'podmayok', 'pad mayak', 'pad mayok'],
    'roomy': ['genis', 'geniş', 'qenis'],
    'perfect': ['ela', 'əla'],
    'communal': ['qaz', 'su', 'isig', 'isiq', 'işiq', 'işıq', 'işığ'],
    'notReady': ["tehvil verilecek", "təhvil veriləcək", "təhfil veriləcək", "təhvil veriləcəy",
                 "hazır deyil", "hazır olacaq", "hazir olacaq"],
    'nearSchool': ['mekteb', 'məktəb', 'məktəp', 'lisey', 'litsey', 'liçsey'],
    'nearShopping': ['market', 'magaza', 'mağaza', 'dukan', 'dükan'],
    'hasContact': ['elaqe', 'əlaqə', 'elaqə', 'nörmə', 'nomre', 'nömre'],
    'funcenter': ['eylence', 'əyləncə'],
    'liveable': ['yaşayış', 'yasayis', 'yaşayiş'],
    'lift': ['lift', 'lifd'],
    'fastLift': ['suretli lift', ' suretli lifd', 'sürətli lift'],
    'garage': ['qaraj', 'garaj'],
    'supermarket': ['araz', 'spar', 'bravo', 'bazarstore', 'bazar stor'],
    'furniture': ['mebel', 'əşya', 'esya', 'avadanlig', 'avadanliq', 'avadanlıq', 'avadanlığ'],
    'jacuzzi': ['jakuzzi', 'jacuzzi', 'modern hamam', 'super hamam'],
    'tax': ['1%'],
    # Previously written to 'floor', which clobbered the numeric floor number.
    'flooring': ['parket', 'isti döşəmə'],
    'heatSystem': ['kombi', 'istiliy sistem', 'istilik sistem'],
    'security': ['tehlukesiz', 'təhlükəsiz', 'təhlükesiz'],
    'newlyBuilt': ['yeni tikili', 'teze tikili'],
    # Column name kept as originally spelled (sic) so existing feature names do not change.
    'newlyReaired': ['yeni temir', 'yeni təmir', 'təzə temir', 'teze temir', 'yenicə təmir', 'yenice temir'],
}


def _mentions_any(text, keywords):
    """Return True if any of *keywords* occurs as a substring of *text*."""
    return any(keyword in text for keyword in keywords)


def feature_extraction(df):
    """Add the engineered feature columns to *df* and return it.

    The frame is modified in place (and also returned).  It must contain the
    columns 'Kupça', 'description', 'İpoteka', 'poster_type', 'Sahə',
    'Mərtəbə' and 'Otaq sayı'.

    Columns written:
      * 'Kupça' / 'description' with missing values filled,
      * 'kupca', 'mort', 'poster_type' as dummy-encoded indicators,
      * 'area', 'rel_floor', 'floor', 'AreaPerRooms' as numeric features,
      * one boolean column per key of ``_DESCRIPTION_KEYWORDS`` telling whether
        the lower-cased description contains any of that key's spellings.
    """
    # null values
    df['Kupça'] = df['Kupça'].fillna(df['Kupça'].mode()[0])
    # Back-fill from the next row, then cover a trailing gap with a placeholder.
    df['description'] = df['description'].bfill().fillna('empty description')

    # NOTE(review): assigning the get_dummies frame to a single column only
    # works when drop_first leaves exactly one column, i.e. the source column
    # has exactly two distinct values — confirm this holds for the data.
    df['kupca'] = pd.get_dummies(df['Kupça'], drop_first=True)
    df['mort'] = pd.get_dummies(df['İpoteka'], drop_first=True)
    df['poster_type'] = pd.get_dummies(df['poster_type'], drop_first=True)

    # 'Sahə' holds "<number> <unit>"; keep the number.
    df['area'] = df['Sahə'].str.split().str[0].astype(float)
    # 'Mərtəbə' holds "<floor>/<total floors>".
    df['rel_floor'] = df['Mərtəbə'].apply(lambda x: int(x.split('/')[0])/int(x.split('/')[1]))
    df['floor'] = df['Mərtəbə'].str.split('/').str[0].astype(float)
    df['AreaPerRooms'] = df['area']/df['Otaq sayı']

    # Lower-case every description once instead of once per feature.
    lowered = df['description'].map(lambda text: str(text).lower())
    for column, keywords in _DESCRIPTION_KEYWORDS.items():
        df[column] = lowered.apply(_mentions_any, args=(keywords,))

    return df

In [None]:
# Numeric floor number: the part of 'Mərtəbə' before the '/'.
# NOTE(review): feature_extraction() assigns df['floor'] itself, so this
# pre-computation looks redundant — confirm before removing.
train['floor'] = train['Mərtəbə'].str.split('/').str[0].astype(float)
test['floor'] = test['Mərtəbə'].str.split('/').str[0].astype(float)

In [None]:
# Add the engineered columns to both splits; the function modifies the frame
# it receives and returns that same frame.
train = feature_extraction(train)
test = feature_extraction(test)

### Outliers

In [None]:
# Summary statistics of the training frame before remove_outliers() is applied.
train.describe()

In [None]:
# Columns inspected for outliers; one box plot is shown per column.
outlier = ['price', 'floor', 'Otaq sayı', 'latitude', 'area', 'AreaPerRooms']

for column_name in outlier:
    column_values = train[column_name]
    sns.boxplot(column_values)
    plt.show()

In [None]:
def remove_outliers(df):
    """Return only the rows of *df* that lie inside the hand-picked ranges.

    A row is kept when all of these hold (every bound is exclusive):
    10 < area < 1000, rooms < 18, floor < 26, 15000 < price < 1000000,
    200 < price / area < 4000, 12 < AreaPerRooms < 95 and latitude < 43.5.
    The input frame is not modified.
    """
    area = df['area']
    price = df['price']
    area_per_room = df['AreaPerRooms']
    price_per_area = price / area

    keep = (
        (area > 10) & (area < 1000)
        & (df['Otaq sayı'] < 18)
        & (df['floor'] < 26)
        & (price > 15000) & (price < 1000000)
        & (price_per_area > 200) & (price_per_area < 4000)
        & (area_per_room > 12) & (area_per_room < 95)
        & (df['latitude'] < 43.5)
    )
    return df[keep]

In [None]:
# Only the training split is filtered; `test` is left as loaded.
train = remove_outliers(train)

In [None]:
# Same box plots as before, now on the filtered training rows.
for i in outlier:
    sns.boxplot(train[i])
    plt.show()

In [None]:
# Summary statistics after the outlier filter.
train.describe()

### Visualization

In [None]:
# Plain alias, not a copy: `df` and `train` name the same DataFrame object, so
# in-place changes made through either name are visible through both.
df = train

In [None]:
# Price against area per room; the trailing semicolon keeps the notebook from
# echoing the return value of the last call.
ax = sns.scatterplot(data=df, x="AreaPerRooms", y="price")
ax.set(title="AreaPerRooms and Price", xlabel="AreaPerRooms", ylabel="Price");

In [None]:
# Price against area, drawn with pyplot directly.
area_values = train['area']
price_values = df['price']
plt.scatter(area_values, price_values)

plt.xlabel('Area')
plt.ylabel('Price')
plt.title("Area and Price")

plt.show()

In [None]:
# Distribution plot of the target.
sns.displot(df['price'])

In [None]:
# Skewness and kurtosis of the target, as computed by pandas.
print(df['price'].skew())
print(df['price'].kurt())

In [None]:
# Distribution of the target with a fitted normal curve overlaid.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept because the `fit=` overlay has no one-line replacement — revisit when
# upgrading seaborn.
sns.distplot(df['price'] , fit=norm);

# Maximum-likelihood mean and standard deviation of a normal fitted to price.
(mu, sigma) = norm.fit(df['price'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Raw string: '\m' and '\s' are invalid escape sequences in a normal literal.
# The backslashes must reach matplotlib's mathtext unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

#Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(df['price'], plot=plt)
plt.show()

### Correlation

In [None]:
# Correlation matrix (DataFrame.corr with its default method) of the target
# and the main numeric features, shown as an annotated heat map.
corr_columns = ['price', 'Otaq sayı', 'area', 'rel_floor', 'AreaPerRooms']

plt.figure(figsize=(8, 8))
cor = df[corr_columns].corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.PuBu)
plt.show()

In [None]:
# Absolute correlation of every column of `cor` with the target.
cor_target = abs(cor["price"])

# Keep the features whose absolute correlation with price exceeds 0.2.
relevant_features = cor_target[cor_target > 0.2]

# Series.iteritems() was removed in pandas 2.0; the index already holds the
# feature names, so read them from there.
names = list(relevant_features.index)

# The target correlates perfectly with itself, so it is always in the list.
names.remove('price')

print(names)
print(len(names))

#### Building Model

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [None]:
# Shapes before the raw columns are dropped.
print(train.shape, test.shape)

In [None]:
# Columns removed from both splits before modelling.  The drop is in place, so
# `df` (an alias of `train` created earlier) loses these columns as well.
drop_list=['title', 'price_currency', 'poster', 'Ünvan', 'description', 'Yeniləndi', 'Mərtəbə', 'Sahə', 'Kupça', 'İpoteka', 'locations', 'seher']
train.drop(drop_list,axis=1,inplace=True)
test.drop(drop_list,axis=1,inplace=True)

In [None]:
# Baseline: XGBRegressor with default settings.  The parameter grid is empty,
# so GridSearchCV only cross-validates that single default configuration.
y = train['price']
x = train.drop('price', axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, test_size=20000, random_state=171)

model = XGBRegressor()
paramater = {}
clf = GridSearchCV(model, paramater, cv=5)
clf.fit(X_train, Y_train)

# Predictions of the fitted search object on both splits.
Y_train_pred = clf.predict(X_train)
Y_test_pred = clf.predict(X_test)

In [None]:
# Score of the fitted search object on both splits; only the hold-out value is
# echoed by the cell.
# NOTE(review): presumably R^2 (the regressor's default `score`) — confirm.
train_score=clf.score(X_train, Y_train)
test_score=clf.score(X_test, Y_test)
test_score

#### Hyperparameter Tuning

In [None]:
def results(Y_train, Y_train_pred, Y_test, Y_test_pred, model, score, X_train, X_test):
    """Print a short train/test report for an already fitted model.

    Printed, in order: the class name of *model*, the mean squared error of
    the supplied predictions and ``model.score`` for the train split, the same
    two numbers for the test split, and the mean of *score* rounded to four
    decimals.  Nothing is returned.
    """
    estimator_name = type(model).__name__
    print(estimator_name)

    train_mse = mean_squared_error(Y_train, Y_train_pred)
    print('Train Mse: {}'.format(train_mse))
    train_fit = model.score(X_train, Y_train)
    print('Train Score: {}'.format(train_fit))

    test_mse = mean_squared_error(Y_test, Y_test_pred)
    print('Test Mse: {}'.format(test_mse))
    test_fit = model.score(X_test, Y_test)
    print('Test Score: {}'.format(test_fit))

    cv_mean = np.round(score.mean(), 4)
    print('Mean of Cross Validation Score: {}'.format(cv_mean))
    print('------------------------------------------------------')

In [None]:
def model_tunings(x, y, model_params, test_size=20000, random_state=171, cv=5):
    """Grid-search every configured model and report train/hold-out metrics.

    Parameters
    ----------
    x, y
        Features and target; they are split once with ``train_test_split``
        and the same split is used for every model.
    model_params : dict
        Maps a display name to ``{'model': estimator, 'params': param_grid}``.
    test_size, random_state
        Forwarded to ``train_test_split``.  The defaults are the values that
        used to be hard-coded here.
    cv
        Forwarded to ``GridSearchCV`` (default 5, as before).

    Returns
    -------
    list of dict
        One entry per model with the keys 'model', 'Train Error',
        'Train Score', 'Test Error', 'Test Score', 'best_score' and
        'best_params'.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state)

    # Named `summaries` so the module-level results() helper is not shadowed.
    summaries = []
    for model_name, mp in model_params.items():
        clf = GridSearchCV(mp['model'], mp['params'], cv=cv, return_train_score=True)
        clf.fit(X_train, Y_train)

        # predict()/score() on the search object use its refit best estimator.
        Y_train_pred = clf.predict(X_train)
        Y_test_pred = clf.predict(X_test)
        train_error = mean_squared_error(Y_train, Y_train_pred)
        train_score = clf.score(X_train, Y_train)
        test_error = mean_squared_error(Y_test, Y_test_pred)
        test_score = clf.score(X_test, Y_test)

        print(model_name)
        print('Train Mse: {}'.format(train_error))
        print('Train Score: {}'.format(train_score))
        print('Test Mse: {}'.format(test_error))
        print('Test Score: {}'.format(test_score))
        print('------------------------------------------------------')

        summaries.append({'model': model_name,
                          'Train Error': train_error,
                          'Train Score': train_score,
                          'Test Error': test_error,
                          'Test Score': test_score,
                          'best_score': clf.best_score_,
                          'best_params': clf.best_params_})
    return summaries

In [None]:
# Search spaces consumed by model_tunings():
# display name -> {'model': unfitted estimator, 'params': GridSearchCV grid}.
model_parameters = {
    'LGBMRegressor': {
        'model': LGBMRegressor(),
        'params': {
            # Key fixed from 'learning_rate ' (trailing space): that string is
            # not the estimator's `learning_rate` parameter, so the learning
            # rate was never actually varied by the search.
            'learning_rate': [0.001, 0.0005, 0.01],
            'max_depth': [7, 10, 13, 16],
            'n_estimators': list(range(200, 1100, 100)),
        },
    },
    'XGBRegressor': {
        'model': XGBRegressor(),
        'params': {
            'reg_alpha': [0.55, .6, 0.51],
            'reg_lambda': [0.5, 0.4, 0.55],
            'max_depth': list(range(7, 20, 2)),
        },
    },
}

In [None]:
# Features and target for the tuning run.  The double brackets make `Y` a
# one-column DataFrame rather than a Series.
X = train.drop('price', axis=1)
Y = train[['price']]

In [None]:
# Grid-search every entry of `model_parameters`; one summary dict per model.
scores = model_tunings(X, Y, model_parameters)

In [None]:
# Tabulate the summaries; rows follow the order of `model_parameters`.
res = pd.DataFrame(scores, columns=['model', 'Train Error', 'Train Score', 'Test Error', 'Test Score',
       'best_score', 'best_params'])

In [None]:
# Lowest hold-out MSE first.
res_sorted=res.sort_values('Test Error', ascending = True)
res_sorted

In [None]:
# NOTE(review): `[1]` on this integer-labelled Series selects the row whose
# label is 1 (the second entry of `model_parameters`), regardless of the sort
# order above — confirm that row, not the best-ranked one, is what is wanted.
res_sorted.best_params[1]

#### Kaggle Part

In [None]:
# Rebuild features/target and draw a new 20000-row hold-out split.  The seed
# here is 42, not the 171 used in the earlier cells.
X = train.drop('price', axis=1)
Y = train[['price']]
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=20000,random_state=42)
print(X.shape, Y.shape)

In [None]:
# Shape of the competition test frame, for comparison with X.
test.shape

In [None]:
# Shapes of the train and hold-out parts of the split.
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

In [None]:
# Final model with fixed hyperparameters.  It is fitted on the full (X, Y),
# not on the X_train split, and then predicts prices for the `test` frame.
model = XGBRegressor(max_depth=11, reg_alpha=0.51, reg_lambda=0.55)
# model = LGBMRegressor(learning_rate=0.001, max_depth= 10, n_estimators=1000)
model.fit(X,Y)
tuned_pr=model.predict(test)

In [None]:
# `model` was fitted on all of X, which contains every row of X_test, so
# scoring it on X_test would report an in-sample (optimistic) value.
# Fit a fresh estimator with the same hyperparameters on the training part of
# the split only, and score that one on the held-out rows.
holdout_model = type(model)(**model.get_params())
holdout_model.fit(X_train, Y_train)
test_score = holdout_model.score(X_test, Y_test)
test_score

In [None]:
# Predictions as a plain Python list, one element per test row.
tuned_res = list(tuned_pr)

In [None]:
# Submission table: the test ids with the predicted price next to them.
submission = test[['_id']].copy()
submission['price'] = tuned_res

In [None]:
# Write the submission to 'least.csv' in the working directory, without the index column.
submission.to_csv('least.csv',index=False)