In [None]:
import pandas as pd

**Collection of Data**

In [None]:
# Read the legitimate and phishing URL tables (in that order) from the
# working directory into two separate dataframes.
legitimate_urls, phishing_urls = (
    pd.read_csv(csv_name)
    for csv_name in ("legitimate-urls.csv", "phishing-urls.csv")
)

In [None]:
# A notebook cell only renders its last bare expression, so the first preview
# would be silently dropped. Call display() explicitly to show both tables.
display(legitimate_urls.head(10))
display(phishing_urls.head(10))

## Data Preprocessing
#### The data is in two data frames, so we merge them into a single dataframe


In [None]:
# Stack the legitimate rows on top of the phishing rows.
# DataFrame.append was removed in pandas 2.0; pd.concat produces the same
# result (original row indices are kept, exactly as append did).
urls = pd.concat([legitimate_urls, phishing_urls])


In [None]:
# Preview the first five rows of the merged dataframe.
urls.head(5)

In [None]:
# Number of columns in the merged dataframe.
len(urls.columns)

In [None]:
# List the column labels of the merged dataframe.
urls.columns

#### Removing Unnecessary columns

In [None]:
# Remove the columns at positions 0, 3 and 5.
# NOTE: the selection is positional and the result is assigned back to `urls`,
# so running this cell a second time removes three further columns.
positions_to_drop = [0, 3, 5]
urls = urls.drop(columns=urls.columns[positions_to_drop])

In [None]:
# Shuffle the rows so the two classes (which were stacked one after the other)
# are interleaved before splitting into train and test sets.
# random_state makes the shuffle reproducible across kernel restarts; the seed
# matches the one used for train_test_split further down.
urls = urls.sample(frac=1, random_state=110).reset_index(drop=True)

#### Removing class variable from the dataset

In [None]:
# Feature matrix: every column except the class variable.
urls_without_labels = urls.drop(columns='label')
# Target vector: the class variable on its own.
labels = urls['label']
# Shown as the cell output (must be the last expression to be rendered):
# the feature columns that remain after removing 'label'.
urls_without_labels.columns


#### splitting the data into train data and test data

Dividing the data in the ratio of 70:30 [train_data:test_data]

In [None]:
from sklearn.model_selection import train_test_split

# 70:30 train/test split.
# stratify=labels keeps the class ratio of the full dataset in both parts
# instead of leaving it to chance; random_state fixes the shuffle so the
# split is reproducible.
data_train, data_test, labels_train, labels_test = train_test_split(
    urls_without_labels,
    labels,
    test_size=0.30,
    random_state=110,
    stratify=labels,
)

In [None]:
# Row counts of the four split parts, printed space-separated on one line
# in the order: train features, test features, train labels, test labels.
print(*map(len, (data_train, data_test, labels_train, labels_test)))

#### checking the split of labels in train and test data

The split should be in equal proportion for both classes

Phishing - 1

Legitimate - 0


In [None]:
# Count how many training rows fall in each class, to check the balance of the training labels.
labels_train.value_counts()


In [None]:
# Count how many test rows fall in each class, to check the balance of the test labels.
labels_test.value_counts()

As the split is in almost equal proportion for both classes, we can train the model.

#### Creating the model and fitting the data into the model

creating the model with default parameters

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline forest with default hyper-parameters.
# random_state is fixed so that the fitted trees, and therefore the reported
# confusion matrix and accuracy, are the same on every run.
random_forest_classifier = RandomForestClassifier(random_state=110)

In [None]:
# Fit the default-parameter forest on the training features and their labels.
random_forest_classifier.fit(data_train,labels_train)


#### Predicting the result for test data

In [None]:
# Predict a class label for every row of the held-out test features.
prediction_label = random_forest_classifier.predict(data_test)

#### Creating confusion matrix and checking the accuracy

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score

# Confusion matrix of true test labels (rows) against predicted labels (columns).
confusionMatrix = confusion_matrix(labels_test, prediction_label)
print(confusionMatrix)
# Last expression of the cell: fraction of test rows classified correctly.
accuracy_score(labels_test, prediction_label)

### Improving the efficiency of the model by specifying max_depth as well as the number of trees

In [None]:
# Tuned forest: 500 trees, each limited to depth 20 and at most 10000 leaf nodes.
# random_state is fixed so the comparison with the default model is reproducible.
custom_random_forest_classifier = RandomForestClassifier(
    n_estimators=500,
    max_depth=20,
    max_leaf_nodes=10000,
    random_state=110,
)

In [None]:
# Fit the tuned forest on the same training features and labels as the default model.
custom_random_forest_classifier.fit(data_train,labels_train)

In [None]:
# Predict a class label for every row of the held-out test features with the tuned forest.
custom_classifier_prediction_label = custom_random_forest_classifier.predict(data_test)

In [None]:
# confusion_matrix and accuracy_score were already imported in the cell that
# evaluates the default classifier, so no import is needed here.
# Rows of the printed matrix are the true test labels, columns the predicted labels.
confusionMatrix2 = confusion_matrix(labels_test,custom_classifier_prediction_label)
print(confusionMatrix2)
# Last expression of the cell: accuracy of the tuned forest on the test set.
accuracy_score(labels_test,custom_classifier_prediction_label)

## Feature Importance Plot

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# feature_importances_ is an array of shape (n_features,), in the same order as
# the columns the forest was fitted on; a higher value means a more important feature.
importances = custom_random_forest_classifier.feature_importances_

# Column positions sorted from most to least important, so the plot reads left to right.
indices = np.argsort(importances)[::-1]
print(f"indices of columns : {indices}")

# Print the feature ranking
print("\n ***Feature ranking: *** \n")
print("Feature name : Importance")

for rank, feature_idx in enumerate(indices, start=1):
    print(f"{rank} {data_train.columns[feature_idx]}   :  {importances[feature_idx]} \n")

# No error bars are drawn below, so the message describes only what is plotted.
print("**** The blue bars are the feature importances of the randomforest classifier *****")

n_features = data_train.shape[1]

# The figure size must be configured BEFORE the figure is created: rcParams are
# read at creation time, so setting them afterwards leaves this figure at the
# default size on a fresh kernel.
plt.rcParams['figure.figsize'] = (35,15)

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
# To add inter-tree error bars, compute
#   std = np.std([tree.feature_importances_ for tree in custom_random_forest_classifier.estimators_], axis=0)
# and pass yerr=std[indices] to plt.bar.
plt.bar(range(n_features), importances[indices],
        color="b", align="center")

plt.xticks(range(n_features), data_train.columns[indices])
plt.xlim([-1, n_features])
plt.show()

In [None]:
# Click on the image to get a clearer view