From c63bf14a85de93ed605f11f2d9d068771502d1d1 Mon Sep 17 00:00:00 2001 From: Ovid Date: Mon, 10 Dec 2012 12:49:46 +0100 Subject: [PATCH] Make the thresshold work (was a silly typo). Also, accidentally perltidied and added. Couldn't figure out a partial revert. --- Changes | 1 + bin/find_duplicate_perl | 2 + lib/Code/CutNPaste.pm | 79 +++++++++++++++++++++------------------ t/fixtures/MyBase.pm | 9 +++++ t/fixtures/MyBaseExact.pm | 16 +++++++- 5 files changed, 69 insertions(+), 38 deletions(-) diff --git a/Changes b/Changes index 58699b6..8bbb87d 100644 --- a/Changes +++ b/Changes @@ -5,6 +5,7 @@ Revision history for Code-CutNPaste - Fix bug where we sometimes try to read the munged file before it's written (the joys of parallel code!) - Cleaned up report format in find_duplicate_perl binary + - Make the thresshold work 0.01 December 9, 2012 Find duplicate code diff --git a/bin/find_duplicate_perl b/bin/find_duplicate_perl index 412d139..125e16f 100644 --- a/bin/find_duplicate_perl +++ b/bin/find_duplicate_perl @@ -11,6 +11,7 @@ GetOptions( 'ignore=s@' => \my @ignore, 'show_warnings' => \my $show_warnings, 'jobs=i' => \( my $jobs = 1 ), + 'threshhold=s' => \my $threshhold, ) or die "Bad options"; my ( @dirs, @files ); @@ -45,6 +46,7 @@ my $cutnpaste = Code::CutNPaste->new( verbose => 1, show_warnings => $show_warnings, jobs => $jobs, + threshhold => $threshhold, %renamed, ); my $duplicates = $cutnpaste->duplicates; diff --git a/lib/Code/CutNPaste.pm b/lib/Code/CutNPaste.pm index 695cdbd..cc13864 100644 --- a/lib/Code/CutNPaste.pm +++ b/lib/Code/CutNPaste.pm @@ -26,6 +26,7 @@ has 'show_warnings' => ( is => 'ro' ); has 'threshhold' => ( is => 'ro', default => sub {.75}, + coerce => sub { return .75 unless defined $_[0] }, isa => sub { my $threshhold = 0 + shift; if ( $threshhold < 0 or $threshhold > 1 ) { @@ -161,7 +162,8 @@ sub BUILD { mkdir $cache_dir; } for my $dir ( @{ $self->dirs } ) { - my @files = grep { !/^\./ } + my @files = + grep { !/^\./ } File::Find::Rule->file->name( '*.pm', '*.t', '*.pl' )->in($dir); # XXX dups and subdirs? @@ -187,13 +189,15 @@ sub find_dups { my $fork = Parallel::ForkManager->new( $jobs || 1 ); - $fork->run_on_finish( sub { - my $duplicates = pop @_; - push @{ $self->_duplicates } => @$duplicates; - }); + $fork->run_on_finish( + sub { + my $duplicates = pop @_; + push @{ $self->_duplicates } => @$duplicates; + } + ); my @left_right; if ( $jobs > 1 ) { - my $files_per_job = int($num_files/$jobs); + my $files_per_job = int( $num_files / $jobs ); for ( 1 .. $jobs ) { if ( $_ < $jobs ) { push @left_right => splice @pairs, 0, $files_per_job; @@ -215,10 +219,10 @@ sub find_dups { ) if $self->verbose; my $count = 1; foreach my $next_files (@left_right) { - $progress->update($count++) if $self->verbose; + $progress->update( $count++ ) if $self->verbose; my $pid = $fork->start and next; - my $duplicates_found = $self->search_for_dups( @$next_files ); + my $duplicates_found = $self->search_for_dups(@$next_files); $fork->finish( 0, $duplicates_found ); } @@ -257,7 +261,7 @@ sub search_for_dups { # brute force is bad! my @duplicates_found; - LINE: foreach ( my $i = 0; $i < @$code1 - $window; $i++ ) { + LINE: foreach ( my $i = 0; $i < @$code1 - $window; $i++ ) { next LINE unless $in_second{ $code1->[$i]{key} }; my @code1 = @{$code1}[ $i .. $#$code1 ]; @@ -265,7 +269,7 @@ sub search_for_dups { my @code2 = @{$code2}[ $j .. $#$code2 ]; my $matches = 0; my $longest = 0; - WINDOW: foreach my $k ( 0 .. $#code1 ) { + WINDOW: foreach my $k ( 0 .. $#code1 ) { if ( $code1[$k]{key} eq $code2[$k]{key} ) { $matches++; my $length1 = length( $code1[$k]{code} ); @@ -281,36 +285,38 @@ sub search_for_dups { last WINDOW; } } + + # if too many lines don't meet our threshold level, don't report + # this block of code if ( $matches >= $window ) { - #if ( my $threshhold = $self->threshhold ) { - # my $total = 0; - # for ( 0 .. $matches - 1 ) { - # $total++ if $code1[$_]{code} =~ /w/; - # } - # if ( $threshhold > $total / $matches ) { - # $matches = 0; - # } - # for ( 0 .. $matches - 1 ) { - # $total++ if $code1[$_]{code} =~ /w/; - # } - #} + if ( my $threshhold = $self->threshhold ) { + my $total = 0; + for ( 0 .. $matches - 1 ) { + $total++ if $code1[$_]{code} =~ /\w/; + } + if ( $threshhold > $total / $matches ) { + $matches = 0; + } + } } if ( $matches >= $window ) { - my $line1 = 0 + $code1[0]{line}; - my $line2 = 0 + $code2[0]{line}; + my $line1 = $code1[0]{line}; + my $line2 = $code2[0]{line}; my ( $left, $right, $report ) = ( '', '', '' ); for ( 0 .. $matches - 1 ) { $left .= $code1[$_]{code} . "\n"; $right .= $code2[$_]{code} . "\n"; - my ( $line1, $line2 ) - = map { chomp; $_ } - ( $code1[$_]{code}, $code2[$_]{code} ); + my ( $line1, $line2 ) = + map { chomp; $_ } ( $code1[$_]{code}, $code2[$_]{code} ); $report .= $line1 . ( ' ' x ( $longest - length($line1) ) ); $report .= " | $line2\n"; } + + # Next duplicate report should start after this chunk of code $i += $matches; + my $ignore = $self->ignore; if ( $ignore and $report =~ /$ignore/ ) { next LINE; @@ -359,8 +365,8 @@ sub get_text { } else { my $stderr; - ( undef, $stderr, @contents ) - = capture {qx($^X -Ilib -MO=CutNPaste $file)}; + ( undef, $stderr, @contents ) = + capture {qx($^X -Ilib -MO=CutNPaste $file)}; undef $stderr if $stderr =~ /syntax OK/; if ( $stderr and !$self->_could_not_deparse->{$file} ) { warn "Problem when parsing $file: $stderr" @@ -371,8 +377,8 @@ sub get_text { local $ENV{RENAME_VARS} = $self->renamed_vars || 0; local $ENV{RENAME_SUBS} = $self->renamed_subs || 0; - ( undef, $stderr, @munged ) - = capture {qx($^X -Ilib -MO=CutNPaste $file)}; + ( undef, $stderr, @munged ) = + capture {qx($^X -Ilib -MO=CutNPaste $file)}; undef $stderr if $stderr =~ /syntax OK/; if ( $stderr and !$self->_could_not_deparse->{$file} ) { warn "\nProblem when parsing $file: $stderr" @@ -426,7 +432,7 @@ sub postfilter { my ( $self, $contents ) = @_; my @contents; - INDEX: for ( my $i = 0; $i < @$contents; $i++ ) { + INDEX: for ( my $i = 0; $i < @$contents; $i++ ) { if ( $contents->[$i]{code} =~ /^(\s*)BEGIN\s*{/ ) { # BEGIN { my $padding = $1; if ( $contents->[ $i + 1 ]{code} =~ /^$padding}/ ) { @@ -447,11 +453,11 @@ sub prefilter { ); my $skip = 0; - LINE: for ( my $i = 0; $i < @$contents; $i++ ) { + LINE: for ( my $i = 0; $i < @$contents; $i++ ) { local $_ = $contents->[$i]; next if /^\s*(?:use|require)\b/; # use/require next if /^\s*$/; # blank lines - next if /^#(?!line\s+[0-9]+)/; # comments which aren't line directives + next if /^#(?!line\s+[0-9]+)/; # comments which aren't line directives # Modules which import things create code like this: # @@ -531,8 +537,6 @@ our your CPU and speed up duplicate code detection. =head2 C -B - A number between 0 and 1. It represents a percentage. If a duplicate section of code is found, the percentage number of lines of code containing "word" characters must exceed the threshhold. This is done to prevent spurious @@ -544,6 +548,9 @@ reporting of chunks of code like this: } | } sub _confirm { | sub _execute { +The above code has on 40% of its lines containing word (C) characters, +and thus will not be reported. + =head1 TODO =over 4 diff --git a/t/fixtures/MyBase.pm b/t/fixtures/MyBase.pm index d785df4..fd38afa 100644 --- a/t/fixtures/MyBase.pm +++ b/t/fixtures/MyBase.pm @@ -1,3 +1,12 @@ +THRESHHOLD_TEST: for my $i ( 0 .. 1 ) { + for my $j ( 0 .. 1 ) { + for my $k ( 0 .. 1 ) { + if ($k) { + my $null = $i + $j + $k; + } + } + } +} sub add_line_numbers { my $contents = prefilter(shift); my $with_varnames = prefilter(shift); diff --git a/t/fixtures/MyBaseExact.pm b/t/fixtures/MyBaseExact.pm index e9d0ae5..20930ad 100644 --- a/t/fixtures/MyBaseExact.pm +++ b/t/fixtures/MyBaseExact.pm @@ -1,3 +1,15 @@ +THRESHHOLD_TEST: for my $i ( 0 .. 1 ) { + for my $j ( 0 .. 1 ) { + for my $k ( 0 .. 1 ) { + next unless $j; + if ($k) { + my $null = $i + $j + $k; + } + } + } +} +sub bar {7} + sub add_line_numbers { my $contents = prefilter(shift); my $with_varnames = prefilter(shift); @@ -9,7 +21,7 @@ sub add_line_numbers { ( $contents->[$i], $with_varnames->[$i] ); chomp $line_with_vars; - if ($line =~ /^#line\s+([0-9]+)/) { + if ( $line =~ /^#line\s+([0-9]+)/ ) { $line_num = $1; next; } @@ -20,7 +32,7 @@ sub add_line_numbers { }; $line_num++; } - return postfilter(\@contents); + return postfilter( \@contents ); } 1;