In [1]:
import glob
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import numpy as np
from scipy.stats import pearsonr
import statistics
from sklearn.model_selection import train_test_split,GroupShuffleSplit
from sklearn.ensemble import (
    RandomForestRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor)
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.model_selection import (
    cross_val_score,
    RepeatedKFold,
    RandomizedSearchCV,
    KFold,
    train_test_split,
    GridSearchCV,
    GroupKFold)
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import Lasso, Ridge
from sklearn.svm import SVR
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.stats import spearmanr, pearsonr
from math import sqrt
import warnings
from scipy import stats
from sklearn.exceptions import ConvergenceWarning

# Suppress ConstantInputWarning
warnings.filterwarnings("ignore", category=stats.ConstantInputWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [2]:
class base:
    def __init__(self, **kwargs):
        self.features = kwargs["features"]
        self.response = kwargs["response"]

    def preProcess_features(self, **kwargs):
        if kwargs["rescale_type"] == "norm":
            standardized = preprocessing.StandardScaler()
            features_processed = standardized.fit_transform(np.array(self.features))
        elif kwargs["rescale_type"] == "minmax":
            norm = preprocessing.MinMaxScaler()
            features_processed = norm.fit_transform(np.array(self.features))
        else:
            print(str(kwargs["rescale_type"])+ "rescaling technique not implemented \n Defaulting to standardized variables")
            standardized = preprocessing.StandardScaler()
            features_processed = standardized.fit_transform(np.array(self.features))

        self.features_processed = features_processed
        return features_processed

    def predict(self, **kwargs):
        features_to_predict = kwargs["features"]
        self.predictions = self.model.predict(features_to_predict)

    def run_CVs(self, **kwargs):

        standardized = preprocessing.StandardScaler()
        norm = preprocessing.MinMaxScaler()

        x_data = kwargs["features"]
        rescale_type = kwargs['rescale_type']

        # Normalise or standardize, two different forms of rescaling
        if kwargs["rescale_type"] == "norm":
            x_data = standardized.fit_transform(np.array(x_data))
        elif kwargs["rescale_type"] == "minmax":
            x_data = norm.fit_transform(np.array(x_data))
        else:
            print("rescaling technique not implemented \n Defaulting to standardized variables")
            x_data = standardized.fit_transform(np.array(x_data))

        y_data = np.array(kwargs["response"])
        n_folds = kwargs["n_folds"]
        title = kwargs["title"]

        kf = KFold(n_splits=n_folds)
        df = {}
        fold_indices = {}

        count = 1
        for train_index, test_index in kf.split(x_data):
            X_train, X_test = x_data[train_index], x_data[test_index]
            y_train, y_test = y_data[train_index], y_data[test_index]

            model_cv = kwargs["model_type"]
            model_cv.fit(X_train, y_train)

            fold = model_cv.predict(X_test)
            df[f"Pred{count}"] = fold
            df[f"Obs{count}"] = y_test

            fold_indices[f"Train{count}"] = train_index
            fold_indices[f"Test{count}"] = test_index

            count += 1

        fig = plt.figure(figsize=(20, 15))

        count = 1

        comb_cv_obs = []
        comb_cv_preds = []

        # Wether or not to visualise the cross validation
        try:
            visualize = kwargs["visualize"]
        except:
            visualize = None

        if visualize:
            for i in range(n_folds):
                print(i)
                print(f"23{count}")
                ax = fig.add_subplot(int(n_folds/2),2,count)
                sns.regplot(x=df[f"Obs{count}"], y=df[f"Pred{count}"])
                ax.spines["right"].set_visible(False)
                ax.spines["top"].set_visible(False)
                ax.set_ylabel("Predicted")
                ax.set_xlabel("Observed")
                ax.set_title(f"Fold{count}")
                r_val, pval = spearmanr(df[f"Obs{count}"], df[f"Pred{count}"])
                r2_val = round(r_val ** 2, 2)
                x_cord, y_cord = max(df[f"Obs{count}"]) * 0.15, max(df[f"Pred{count}"])
                ax.annotate(f"$R^2 = {r2_val}$", (x_cord, y_cord))

                comb_cv_preds.extend(df[f"Pred{count}"])
                comb_cv_obs.extend(df[f"Obs{count}"])

                count += 1

            R, pVal = spearmanr(comb_cv_obs, comb_cv_preds)
            R2 = round(R ** 2, 2)

            fig.suptitle(f"{title} (Combined data $R^2$ = {R2})")
            plt.tight_layout()
            direc = os.getcwd()
            out_direc = f"{direc}"
            os.makedirs(out_direc, exist_ok=True)
            # plt.savefig(f'{out_direc}/Fold_{title}.png')
            # plt.savefig(f'{out_direc}/Fold_{title}.svg')
            # plt.show()

            fig2 = plt.figure(figsize=(12, 10))
            ax2 = fig2.add_subplot(111)
            sns.regplot(x=comb_cv_obs, y=comb_cv_preds, ax=ax2)
            ax2.spines["top"].set_visible(False)
            ax2.spines["right"].set_visible(False)
            ax2.set_title(title, pad=10)
            r, pval = spearmanr(comb_cv_obs, comb_cv_preds)
            r2 = round(r ** 2, 2)
            x_coord = max(comb_cv_obs) * 0.75
            y_coord = max(comb_cv_preds) * 0.99
            ax2.text(x_coord, y_coord, f"$R^2 = {round(r2,2)}$")
            ax2.set_xlabel("Observations", labelpad=20)
            ax2.set_ylabel("Predictions", labelpad=20)
            fig2.tight_layout()
            fig2.savefig(f'{out_direc}/{title}.png') 
            # fig2.savefig(f'{out_direc}/{title}.svg')

        return df, fold_indices

In [3]:
class LR(base):

    """ Linear Regression Model """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.features_processed = self.preProcess_features(rescale_type=kwargs.get("rescale_type"))

    def train_lr(self, **kwargs):
        model = LinearRegression()

        model.fit(self.features_processed, self.response)
        self.model = model
        return model

    def run_CVs(self, **kwargs):
        df, fold_indices = super().run_CVs(
            model_type=LinearRegression(),
            features=kwargs["features"],
            response=kwargs["response"],
            n_folds=kwargs["n_folds"],
            title=kwargs["title"],
            visualize=kwargs["visualize"],
            rescale_type=kwargs["rescale_type"])

        return df, fold_indices


class RF(base):

    """" Random Forest Regression Model """
    
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.features_processed = self.preProcess_features(rescale_type=kwargs.get("rescale_type"))


    def grid_search(self, **kwargs):
        n_estimators = [int(x) for x in np.linspace(start=100, stop=1000, num=10)]
        max_features = [1, 2, 3, 4,5,6,7,8,9,10]
        max_depth = [int(x) for x in np.linspace(10, stop=100, num=11)]
        max_depth.append(None)
        min_samples_split = [2, 5, 10]
        min_samples_leaf = [10, 15, 20]
        bootstrap = [True, False]
        random_grid = {
            "n_estimators": n_estimators,
            "max_features": max_features,
            "max_depth": max_depth,
            "min_samples_split": min_samples_split,
            "min_samples_leaf": min_samples_leaf,
            "bootstrap": bootstrap}
       
        rf = RandomForestRegressor()
       
        rf_random = RandomizedSearchCV(
            estimator=rf,
            param_distributions=random_grid,
            n_iter=100,
            cv=3,
            verbose=0,
            random_state=42,
            n_jobs=-1)
        rf_random.fit(self.features_processed, self.response)
        # print(rf_random.best_params_)

        self.ran_params = rf_random.best_params_

    def train_rf(self, **kwargs):
        model = RandomForestRegressor(
            n_estimators=self.ran_params["n_estimators"],
            min_samples_leaf=self.ran_params["min_samples_leaf"],
            min_samples_split=self.ran_params["min_samples_split"],
            max_features=self.ran_params["max_features"],
            max_depth=self.ran_params["max_depth"],
            bootstrap=self.ran_params["bootstrap"])

        model.fit(self.features_processed, self.response)
        self.model = model
        
        return model

    def run_CVs(self, **kwargs):
        df, fold_indices = super().run_CVs(
            model_type=RandomForestRegressor(
                n_estimators=self.ran_params["n_estimators"],
                min_samples_leaf=self.ran_params["min_samples_leaf"],
                min_samples_split=self.ran_params["min_samples_split"],
                max_features=self.ran_params["max_features"],
                max_depth=self.ran_params["max_depth"],
                bootstrap=self.ran_params["bootstrap"]),
            features=kwargs["features"],
            response=kwargs["response"],
            n_folds=kwargs["n_folds"],
            title=kwargs["title"],
            visualize=kwargs["visualize"],
            rescale_type=kwargs["rescale_type"])

        return df, fold_indices

    def feature_importance(self, **kwargs):

        importances = self.model.feature_importances_
        self.feature_importance_std = np.std([tree.feature_importances_ for tree in self.model.estimators_], axis=0)
        return importances
    



In [18]:
path='/home/schnablelab/Documents/NNSatelliteImages/Data/'
mainfolder=os.listdir(path)

Correlation=[]

for mainfolders in mainfolder:
    if not mainfolders.endswith('.csv'):
        mainfolderspath=os.path.join(path,mainfolders)
        location=os.path.basename(mainfolderspath)
        # print(location)
        # if not location=='Crawfordsville':
        #     continue
        
        subfolder=os.listdir(mainfolderspath)
        for subfolders in subfolder:
            
            if subfolders=='Satelliteimages':
                continue
                
                satfolderpath=os.path.join(mainfolderspath,subfolders)
                
                satsubfolders=os.listdir(satfolderpath)
                bands='Satellite Image'
                
                for satsubfolder in satsubfolders:
                    
                    if satsubfolder=='sixband':
                        
                        finalsatfolderpath=os.path.join(satfolderpath,satsubfolder)
                    
                        satfiles=os.listdir(finalsatfolderpath)
                        
                        for file in satfiles:
                            if file.endswith('_genotype.csv'):
                                
                                print(file)
                                
                                timepoint=file.split('_')[-2]
                                
                                print(timepoint)
                                
                                datafilepath=os.path.join(finalsatfolderpath,file)
                                
                                datadf=pd.read_csv(datafilepath,index_col=0)
                                
                                datadf=datadf.iloc[:,list(range(0, 39))+[45]+[-1]]
                                # print(datadf.columns)
                                
                                datadf=datadf.dropna(subset=['yieldPerAcre'])
                                
                                features=datadf[datadf.columns[datadf.columns.str.contains('mean|sum|median')]]
                                response=datadf['yieldPerAcre']
                                
                                ######RFmodel############
                                model=RF(response=response,features=features,rescale_type="norm")
                                model.grid_search()
                                RFmodel=model.train_rf(response=response, features=features)
                                
                                
                                for mainfolders in mainfolder:
                                    
                                    if not mainfolders.endswith('.csv'):
                                        
                                        submainfolderspath=os.path.join(path,mainfolders)
                                        sublocation=os.path.basename(submainfolderspath)
                                        # print(sublocation)
                                        
                                        subsubfolder=os.listdir(submainfolderspath)
                                        
                                        for subsubfolders in subsubfolder:
            
                                            if subsubfolders=='Satelliteimages':
                                                # continue

                                                subsatfolderpath=os.path.join(submainfolderspath,subsubfolders)

                                                subsatsubfolders=os.listdir(subsatfolderpath)
                                                subbands='Satellite Image'

                                                for subsatsubfolder in subsatsubfolders:

                                                    if subsatsubfolder=='sixband':
                                                        subfinalsatfolderpath=os.path.join(subsatfolderpath,subsatsubfolder)
                    
                                                        subsatfiles=os.listdir(subfinalsatfolderpath)
                        
                                                        for subfile in subsatfiles:
                                                            if subfile.endswith('_genotype.csv'):
                                        
                                                                subtimepoint=subfile.split('_')[-2]
                                                                
                                                                subdatafilepath=os.path.join(subfinalsatfolderpath,subfile)
                                
                                                                subdatadf=pd.read_csv(subdatafilepath,index_col=0)

                                                                subdatadf=subdatadf.iloc[:,list(range(0, 39))+[45]]
                                                                # print(datadf.columns)

                                                                subdatadf=subdatadf.dropna(subset=['yieldPerAcre'])

                                                                testfeatures=subdatadf[subdatadf.columns[subdatadf.columns.str.contains('mean|sum|median')]]
                                                                testresponse=subdatadf['yieldPerAcre']
                                                        
                                                                Preprocessing=preprocessing.StandardScaler()
                                                                testfeatures=Preprocessing.fit_transform(testfeatures)
                                                                
                                                                results=RFmodel.predict(testfeatures)
                                                                results = results.flatten()
                                                                r,p=pearsonr(results,testresponse)
                                        
                                                                r_squared=r*r
                                                                r_squared=round(r_squared,5)
                                            
                                            
                                                                modelname=f'{location}_{timepoint}'
                                                                testname=f'{sublocation}_{subtimepoint}'
                                                                
                                                                
                                                                Correlation.append({'model':modelname, 'testset':testname,'r2':r_squared})
                                                                
                                                                
            if subfolders=='UAV':
                # print(subfolders)
                    
                    
                
                satfolderpath=os.path.join(mainfolderspath,subfolders)
                # print(satfolderpath)
                
                satsubfolders=os.listdir(satfolderpath)
                print(satsubfolders)
                
                for file in satsubfolders:
#                 bands='Satellite Image'
               
                    if file.endswith('_genotype.csv'):
                        # print(file)

                        timepoint=file.split('_')[-2]

                        # print(timepoint)

                        datafilepath=os.path.join(satfolderpath,file)
                        # print(datafilepath)

                        datadf=pd.read_csv(datafilepath,index_col=0)

                        datadf=datadf.iloc[:,list(range(0, 18))+[24]+[-1]]
                        
                        # print(datadf.columns)

                        datadf=datadf.dropna(subset=['yieldPerAcre'])

                        features=datadf[datadf.columns[datadf.columns.str.contains('mean|sum|median')]]
                        response=datadf['yieldPerAcre']

                        ######RFmodel############
                        model=RF(response=response,features=features,rescale_type="norm")
                        model.grid_search()
                        RFmodel=model.train_rf(response=response, features=features)


                        for mainfolders in mainfolder:

                                if not mainfolders.endswith('.csv'):
                                    submainfolderspath=os.path.join(path,mainfolders)
                                    sublocation=os.path.basename(submainfolderspath)
                                    print(sublocation)

                                    subsubfolder=os.listdir(submainfolderspath)

                                    for subsubfolders in subsubfolder:

                                        if subsubfolders=='UAV':
                                            # continue

                                            subsatfolderpath=os.path.join(submainfolderspath,subsubfolders)

                                            subsatsubfolders=os.listdir(subsatfolderpath)
                                            subbands='Satellite Image'
                                            for subfile in subsatsubfolders:
                                                if subfile.endswith('_genotype.csv'):
                                                    subtimepoint=subfile.split('_')[-2]

                                                    subdatafilepath=os.path.join(subsatfolderpath,subfile)

                                                    subdatadf=pd.read_csv(subdatafilepath,index_col=0)

                                                    subdatadf=subdatadf.iloc[:,list(range(0, 18))+[24]]
                                                    print(datadf.columns)

                                                    subdatadf=subdatadf.dropna(subset=['yieldPerAcre'])

                                                    testfeatures=subdatadf[subdatadf.columns[subdatadf.columns.str.contains('mean|sum|median')]]
                                                    testresponse=subdatadf['yieldPerAcre']

                                                    Preprocessing=preprocessing.StandardScaler()
                                                    testfeatures=Preprocessing.fit_transform(testfeatures)

                                                    results=RFmodel.predict(testfeatures)
                                                    results = results.flatten()
                                                    r,p=pearsonr(results,testresponse)

                                                    r_squared=r*r
                                                    r_squared=round(r_squared,5)


                                                    modelname=f'{location}_{timepoint}'
                                                    testname=f'{sublocation}_{subtimepoint}'


                                                    Correlation.append({'model':modelname, 'testset':testname,'r2':r_squared})                      

                                                                


                                        
                                        

                                
                                
                        
                        
                        

['RGBuav_TP2.csv', 'RGBuav_TP1.csv', 'RGBuav_TP3_genotype.csv', 'RGBuav_TP2_genotype.csv', 'RGBuav_TP1_genotype.csv', 'RGBuav_TP3.csv']
Scottsbluff
Index(['Red_mean', 'Red_median', 'Red_sum', 'Green_mean', 'Green_median',
       'Green_sum', 'Blue_mean', 'Blue_median', 'Blue_sum', 'GLI_mean',
       'GLI_median', 'GLI_sum', 'NGRDI_mean', 'NGRDI_median', 'NGRDI_sum',
       'GRVI_mean', 'GRVI_median', 'GRVI_sum', 'yieldPerAcre', 'genotype'],
      dtype='object')
Index(['Red_mean', 'Red_median', 'Red_sum', 'Green_mean', 'Green_median',
       'Green_sum', 'Blue_mean', 'Blue_median', 'Blue_sum', 'GLI_mean',
       'GLI_median', 'GLI_sum', 'NGRDI_mean', 'NGRDI_median', 'NGRDI_sum',
       'GRVI_mean', 'GRVI_median', 'GRVI_sum', 'yieldPerAcre', 'genotype'],
      dtype='object')
Index(['Red_mean', 'Red_median', 'Red_sum', 'Green_mean', 'Green_median',
       'Green_sum', 'Blue_mean', 'Blue_median', 'Blue_sum', 'GLI_mean',
       'GLI_median', 'GLI_sum', 'NGRDI_mean', 'NGRDI_median', 'NGRDI

In [20]:
pd.DataFrame(Correlation).to_csv('AllTestCorrelation_newversion_uav.csv', index=False)

In [22]:
pd.DataFrame(Correlation)

Unnamed: 0,model,testset,r2
0,Scottsbluff_TP3,Scottsbluff_TP3,0.52158
1,Scottsbluff_TP3,Scottsbluff_TP2,0.00109
2,Scottsbluff_TP3,Scottsbluff_TP1,0.00747
3,Scottsbluff_TP3,Crawfordsville_TP3,0.00780
4,Scottsbluff_TP3,Crawfordsville_TP2,0.04089
...,...,...,...
220,Ames_TP1,Movalley_TP2,0.11781
221,Ames_TP1,Movalley_TP1,0.00228
222,Ames_TP1,Ames_TP3,0.05912
223,Ames_TP1,Ames_TP2,0.00075
