In [1]:
import os
import numpy as np
import pandas
import pandas as pd
from optparse import OptionParser
import sys
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics

# 数据的预处理工作已经完成了的

# 数据进行分割处理：
### 等量分成11份，后续用于十折增量模型

def file_split(filename, file_num):
    # 获得每个文件需要有的行数
    chunksize = 18200   #先初始化的chunksize是100W
    data1 = pd.read_csv(filename, chunksize = chunksize ,encoding='ANSI') 
    num = 0
    for chunk in data1:
        num += len(chunk)
    chunksize = round(num / file_num + 1)

    # 需要存的file
    head, tail = os.path.splitext(filename)
    data2 = pd.read_csv(filename, chunksize = chunksize ,encoding='ANSI')
    i = 0 #定文件名
    for chunk in data2:
        chunk.to_csv('{0}_{1}{2}'.format(head, i, tail),index=False)
        print('保存第{0}个数据'.format(i))
        i += 1
    
if __name__ == '__main__':
    filename = './data/eclipse/Buginfo/buginfo.csv'
    file_split(filename, 11)

# 显示一下数据信息：

In [4]:
df=pd.read_csv('./data/eclipse/Buginfo/buginfo_part1.csv',encoding="ANSI")
df=df[['fixer','text']]
print("数据总量: %d ." % len(df))
df.head()

print("在fixer列中总共有 %d 个空值." % df.fixer.isnull().sum())
print("在text列中总共有 %d 个空值." % df.text.isnull().sum())
df[df.isnull().values==True]
df = df[pd.notnull(df.text)]

df_text=df.text.values.astype('U')

数据总量: 106065 .
在fixer列中总共有 0 个空值.
在text列中总共有 0 个空值.


# 特征处理:TF_IDF

In [6]:
vectorizer= TfidfVectorizer()

#一次导入X，y,test_size
X_train, X_test, y_train, y_test = train_test_split(df_text.tolist(), df.fixer, test_size=0.3)

#提取训练集特征
X_train_counts = vectorizer.fit_transform(X_train)
print("train:n_samples: %d, n_features: %d" % X_train_counts.shape)

#提取测试样本特征集
#df_test=pd.read_csv('./data/eclipse/Buginfo/buginfo.csv')
#corpus=df_test.text.tolist()
X_test_counts = vectorizer.transform(X_test)
print("test:n_samples: %d, n_features: %d" % X_test_counts.shape)

train:n_samples: 74245, n_features: 162437
test:n_samples: 31820, n_features: 162437


# 建立预测模型，对模型进行测试：

## 朴素贝叶斯模型：

In [7]:
estimator = MultinomialNB()

#这里是有监督的训练模型,所以要同时输入X_train_counts和y_train
estimator.fit(X_train_counts, y_train)

#训练好的模型，对测试集数据进行预测
predicted = estimator.predict(X_test_counts)

# 这里我们仅仅打印前10个测试数据的预测及其真实类别
for prediction, truth in zip(predicted[:10], y_test[:10]):
    print('%-30s%-20s' %(prediction,truth))

darin_swanson                 rodrigo             
daniel_megert                 ganoro              
aeschli                       frederic_fusier     
daniel_megert                 christophe.cornu+eclipse
aeschli                       aeschli             
michael_valenta               michael_valenta     
darin_swanson                 darin_swanson       
wassim.melhem                 nickboldt           
mik.kersten                   mik.kersten         
nickboldt                     nickboldt           


In [8]:
#检验贝叶斯模型的准确率
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support

#按照分类labels分类输出准确率召回率F1值
print(classification_report(y_test, predicted, labels=df.fixer.unique()))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                          precision    recall  f1-score   support

            claude_knaus       0.00      0.00      0.00       106
         jerome_lanneluc       0.96      0.17      0.29       275
                 akiezun       1.00      0.02      0.04       189
          philippe_mulet       0.78      0.38      0.51       288
         kai-uwe_maetzel       1.00      0.00      0.01       258
            dirk_baeumer       1.00      0.01      0.02       331
             jared_burns       0.00      0.00      0.00       194
             erich_gamma       0.00      0.00      0.00       161
           darin_swanson       0.69      0.61      0.65       647
           darin.eclipse       0.11      0.98      0.20       799
             james_moody       0.00      0.00      0.00       135
                 unknown       0.00      0.00      0.00        22
              nick_edgar       0.00      0.00      0.00       183
                 aeschli       0.15      0.89      0.25       770
     jean

## k邻近算法：

In [9]:
neigh=KNeighborsClassifier()

neigh.fit(X_train_counts, y_train)

#训练好的模型，对测试集数据进行预测
predicted = neigh.predict(X_test_counts)

# 这里我们仅仅打印前10个测试数据的预测及其真实类别
for prediction, truth in zip(predicted[:10], y_test[:10]):
    print('%-30s%-20s' %(prediction,truth))

simon_arsenault               rodrigo             
ganoro                        ganoro              
wwang                         frederic_fusier     
christophe.cornu+eclipse      christophe.cornu+eclipse
aeschli                       aeschli             
mik.kersten                   michael_valenta     
darin_swanson                 darin_swanson       
wassim.melhem                 nickboldt           
mik.kersten                   mik.kersten         
nickboldt                     nickboldt           


In [10]:
#按照分类labels分类输出准确率召回率F1值
print(classification_report(y_test, predicted, labels=df.fixer.unique()))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                          precision    recall  f1-score   support

            claude_knaus       0.37      0.40      0.38       106
         jerome_lanneluc       0.55      0.47      0.51       275
                 akiezun       0.44      0.71      0.55       189
          philippe_mulet       0.59      0.62      0.60       288
         kai-uwe_maetzel       0.53      0.37      0.44       258
            dirk_baeumer       0.47      0.50      0.49       331
             jared_burns       0.57      0.35      0.43       194
             erich_gamma       0.41      0.32      0.36       161
           darin_swanson       0.60      0.73      0.66       647
           darin.eclipse       0.51      0.73      0.60       799
             james_moody       0.68      0.55      0.61       135
                 unknown       0.25      0.05      0.08        22
              nick_edgar       0.56      0.45      0.50       183
                 aeschli       0.33      0.59      0.43       770
     jean

## 决策树算法：

In [11]:
from sklearn.tree import DecisionTreeClassifier

dtree=DecisionTreeClassifier()

dtree.fit(X_train_counts, y_train)

#训练好的模型，对测试集数据进行预测
predicted = dtree.predict(X_test_counts)

# 这里我们仅仅打印前10个测试数据的预测及其真实类别
for prediction, truth in zip(predicted[:10], y_test[:10]):
    print('%-30s%-20s' %(prediction,truth))
    
#按照分类labels分类输出准确率召回率F1值
print(classification_report(y_test, predicted, labels=df.fixer.unique()))

claude_knaus                  rodrigo             
ganoro                        ganoro              
david_audel                   frederic_fusier     
christophe.cornu+eclipse      christophe.cornu+eclipse
andre_weinand                 aeschli             
kevin_mcguire                 michael_valenta     
darin_swanson                 darin_swanson       
nickboldt                     nickboldt           
mik.kersten                   mik.kersten         
nickboldt                     nickboldt           


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                          precision    recall  f1-score   support

            claude_knaus       0.68      0.71      0.69       106
         jerome_lanneluc       0.65      0.71      0.67       275
                 akiezun       0.64      0.67      0.65       189
          philippe_mulet       0.63      0.69      0.66       288
         kai-uwe_maetzel       0.75      0.79      0.77       258
            dirk_baeumer       0.55      0.59      0.57       331
             jared_burns       0.60      0.63      0.62       194
             erich_gamma       0.47      0.43      0.45       161
           darin_swanson       0.83      0.83      0.83       647
           darin.eclipse       0.78      0.78      0.78       799
             james_moody       0.69      0.73      0.71       135
                 unknown       0.33      0.23      0.27        22
              nick_edgar       0.49      0.52      0.50       183
                 aeschli       0.74      0.76      0.75       770
     jean

## 随机森林：

In [13]:
randomforest= RandomForestClassifier()
randomforest.fit(X_train_counts, y_train)

#训练好的模型，对测试集数据进行预测
predicted = randomforest.predict(X_test_counts)

# 这里我们仅仅打印前10个测试数据的预测及其真实类别
for prediction, truth in zip(predicted[:10], y_test[:10]):
    print('%-30s%-20s' %(prediction,truth))
    
#按照分类labels分类输出准确率召回率F1值
print(classification_report(y_test, predicted, labels=df.fixer.unique()))

darin_swanson                 rodrigo             
ganoro                        ganoro              
david_audel                   frederic_fusier     
christophe.cornu+eclipse      christophe.cornu+eclipse
aeschli                       aeschli             
mik.kersten                   michael_valenta     
darin_swanson                 darin_swanson       
nickboldt                     nickboldt           
mik.kersten                   mik.kersten         
nickboldt                     nickboldt           


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                          precision    recall  f1-score   support

            claude_knaus       0.84      0.85      0.85       106
         jerome_lanneluc       0.68      0.93      0.78       275
                 akiezun       0.78      0.74      0.76       189
          philippe_mulet       0.74      0.81      0.77       288
         kai-uwe_maetzel       0.78      0.91      0.84       258
            dirk_baeumer       0.64      0.74      0.69       331
             jared_burns       0.72      0.45      0.56       194
             erich_gamma       0.81      0.49      0.61       161
           darin_swanson       0.75      0.87      0.81       647
           darin.eclipse       0.60      0.92      0.72       799
             james_moody       0.86      0.64      0.74       135
                 unknown       0.00      0.00      0.00        22
              nick_edgar       0.76      0.64      0.70       183
                 aeschli       0.66      0.93      0.77       770
     jean

# 逻辑回归：

In [None]:
from sklearn.linear_model import LogisticRegression
logisticreg= LogisticRegression(random_state=0)
logisticreg.fit(X_train_counts, y_train)

#训练好的模型，对测试集数据进行预测
predicted = logisticreg.predict(X_test_counts)

# 这里我们仅仅打印前10个测试数据的预测及其真实类别
for prediction, truth in zip(predicted[:10], y_test[:10]):
    print('%-30s%-20s' %(prediction,truth))
    
#按照分类labels分类输出准确率召回率F1值
print(classification_report(y_test, predicted, labels=df.fixer.unique()))